From 0357f496ca3d40e70545160652d07e34794775bc Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 09:09:25 +0800 Subject: [PATCH 01/28] add vld2 neon instrs --- .../core_arch/src/aarch64/neon/generated.rs | 95 +++ .../src/arm_shared/neon/generated.rs | 540 ++++++++++++++++++ crates/stdarch-gen/neon.spec | 22 +- 3 files changed, 646 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 392f9d7d9b..d3c2d493bf 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4592,6 +4592,61 @@ pub unsafe fn vld1q_f64_x4(a: *const f64) -> float64x2x4_t { vld1q_f64_x4_(a) } +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_s64(a: *const i64) -> int64x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2i64.p0v2i64")] + fn vld2q_s64_(ptr: *const int64x2_t) -> int64x2x2_t; + } + vld2q_s64_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_u64(a: *const u64) -> uint64x2x2_t { + transmute(vld2q_s64(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_p64(a: *const p64) -> poly64x2x2_t { + transmute(vld2q_s64(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v1f64.p0v1f64")] + fn vld2_f64_(ptr: *const float64x1_t) -> float64x1x2_t; + } + vld2_f64_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2))] +pub unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2f64.p0v2f64")] + fn vld2q_f64_(ptr: *const float64x2_t) -> float64x2x2_t; + } + vld2q_f64_(a.cast()) +} + /// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] @@ -13061,6 +13116,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_s64() { + let a: [i64; 5] = [0, 1, 2, 2, 3]; + let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 3)]; + let r: [i64x2; 2] = transmute(vld2q_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_u64() { + let a: [u64; 5] = [0, 1, 2, 2, 3]; + let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(2, 3)]; + let r: [u64x2; 2] = transmute(vld2q_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_p64() { + let a: [u64; 5] = [0, 1, 2, 2, 3]; + let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 3)]; + let r: [i64x2; 2] = transmute(vld2q_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_f64() { + let a: [f64; 3] = [0., 1., 2.]; + let e: [f64; 2] = 
[1., 2.]; + let r: [f64; 2] = transmute(vld2_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_f64() { + let a: [f64; 5] = [0., 1., 2., 2., 3.]; + let e: [f64x2; 2] = [f64x2::new(1., 2.), f64x2::new(2., 3.)]; + let r: [f64x2; 2] = transmute(vld2q_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_f64_x2() { let a: [f64; 3] = [0., 1., 2.]; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 616aad8ac4..b0f30d9ee5 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -6758,6 +6758,378 @@ pub unsafe fn vld1q_f32_x4(a: *const f32) -> float32x4x4_t { vld1q_f32_x4_(a) } +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8i8.p0i8")] + fn vld2_s8_(ptr: *const i8, size: i32) -> int8x8x2_t; + } +vld2_s8_(a as *const i8, 1) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_s8(a: *const i8) -> int8x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v8i8.p0v8i8")] + fn vld2_s8_(ptr: *const int8x8_t) -> int8x8x2_t; + } +vld2_s8_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4i16.p0i8")] + fn vld2_s16_(ptr: *const i8, size: i32) -> int16x4x2_t; + } +vld2_s16_(a as *const i8, 2) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_s16(a: *const i16) -> int16x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v4i16.p0v4i16")] + fn vld2_s16_(ptr: *const int16x4_t) -> int16x4x2_t; + } +vld2_s16_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v2i32.p0i8")] + fn vld2_s32_(ptr: *const i8, size: i32) -> int32x2x2_t; + } +vld2_s32_(a as *const i8, 4) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + 
#[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2i32.p0v2i32")] + fn vld2_s32_(ptr: *const int32x2_t) -> int32x2x2_t; + } +vld2_s32_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v1i64.p0i8")] + fn vld2_s64_(ptr: *const i8, size: i32) -> int64x1x2_t; + } +vld2_s64_(a as *const i8, 8) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v1i64.p0v1i64")] + fn vld2_s64_(ptr: *const int64x1_t) -> int64x1x2_t; + } +vld2_s64_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v16i8.p0i8")] + fn vld2q_s8_(ptr: *const i8, size: i32) -> int8x16x2_t; + } +vld2q_s8_(a as *const i8, 1) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_s8(a: *const i8) -> int8x16x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v16i8.p0v16i8")] + fn vld2q_s8_(ptr: *const int8x16_t) -> int8x16x2_t; + } +vld2q_s8_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v8i16.p0i8")] + fn vld2q_s16_(ptr: *const i8, size: i32) -> int16x8x2_t; + } +vld2q_s16_(a as *const i8, 2) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_s16(a: *const i16) -> int16x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v8i16.p0v8i16")] + fn vld2q_s16_(ptr: *const int16x8_t) -> int16x8x2_t; + } +vld2q_s16_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4i32.p0i8")] + fn vld2q_s32_(ptr: *const i8, size: i32) -> int32x4x2_t; + } 
+vld2q_s32_(a as *const i8, 4) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_s32(a: *const i32) -> int32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v4i32.p0v4i32")] + fn vld2q_s32_(ptr: *const int32x4_t) -> int32x4x2_t; + } +vld2q_s32_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t { + transmute(vld2_s8(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t { + transmute(vld2_s16(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t { + transmute(vld2_s32(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { + transmute(vld2_s64(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_u8(a: *const u8) -> uint8x16x2_t { + transmute(vld2q_s8(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_u16(a: *const u16) -> uint16x8x2_t { + transmute(vld2q_s16(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_u32(a: *const u32) -> uint32x4x2_t { + transmute(vld2q_s32(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_p8(a: *const p8) -> poly8x8x2_t { + transmute(vld2_s8(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_p16(a: *const p16) -> poly16x4x2_t { + transmute(vld2_s16(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_p8(a: *const p8) -> poly8x16x2_t { + transmute(vld2q_s8(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t { + transmute(vld2q_s16(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_p64(a: *const p64) -> poly64x1x2_t { + transmute(vld2_s64(transmute(a))) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v2f32.p0i8")] + fn vld2_f32_(ptr: *const i8, size: i32) -> float32x2x2_t; + } +vld2_f32_(a as *const i8, 4) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_f32(a: *const f32) -> float32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v2f32.p0v2f32")] + fn vld2_f32_(ptr: *const float32x2_t) -> float32x2x2_t; + } +vld2_f32_(a.cast()) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v4f32.p0i8")] + fn vld2q_f32_(ptr: *const i8, size: i32) -> float32x4x2_t; + } +vld2q_f32_(a as *const i8, 4) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t { + 
#[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v4f32.p0v4f32")] + fn vld2q_f32_(ptr: *const float32x4_t) -> float32x4x2_t; + } +vld2q_f32_(a.cast()) +} + /// Store multiple single-element structures from one, two, three, or four registers #[inline] #[cfg(target_arch = "arm")] @@ -21697,6 +22069,174 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2_s8() { + let a: [i8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 3, 2, 3, 4, 5), i8x8::new(2, 3, 4, 5, 6, 7, 8, 9)]; + let r: [i8x8; 2] = transmute(vld2_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_s16() { + let a: [i16; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5]; + let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 3), i16x4::new(2, 3, 4, 5)]; + let r: [i16x4; 2] = transmute(vld2_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_s32() { + let a: [i32; 5] = [0, 1, 2, 2, 3]; + let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(2, 3)]; + let r: [i32x2; 2] = transmute(vld2_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_s64() { + let a: [i64; 3] = [0, 1, 2]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; + let r: [i64x1; 2] = transmute(vld2_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_s8() { + let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9), i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)]; + let r: [i8x16; 2] = transmute(vld2q_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_s16() { + let a: [i16; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 3, 2, 3, 4, 5), i16x8::new(2, 3, 4, 5, 6, 7, 8, 9)]; + let r: [i16x8; 2] = transmute(vld2q_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_s32() { + let a: [i32; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5]; + let e: [i32x4; 2] = [i32x4::new(1, 2, 2, 3), i32x4::new(2, 3, 4, 5)]; + let r: [i32x4; 2] = transmute(vld2q_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_u8() { + let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; + let e: [u8x8; 2] = [u8x8::new(1, 2, 2, 3, 2, 3, 4, 5), u8x8::new(2, 3, 4, 5, 6, 7, 8, 9)]; + let r: [u8x8; 2] = transmute(vld2_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_u16() { + let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5]; + let e: [u16x4; 2] = [u16x4::new(1, 2, 2, 3), u16x4::new(2, 3, 4, 5)]; + let r: [u16x4; 2] = transmute(vld2_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_u32() { + let a: [u32; 5] = [0, 1, 2, 2, 3]; + let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(2, 3)]; + let r: [u32x2; 2] = transmute(vld2_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_u64() { + let a: [u64; 3] = [0, 1, 2]; + let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)]; + let r: [u64x1; 2] = transmute(vld2_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vld2q_u8() { + let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [u8x16; 2] = [u8x16::new(1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9), u8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)]; + let r: [u8x16; 2] = transmute(vld2q_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_u16() { + let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; + let e: [u16x8; 2] = [u16x8::new(1, 2, 2, 3, 2, 3, 4, 5), u16x8::new(2, 3, 4, 5, 6, 7, 8, 9)]; + let r: [u16x8; 2] = transmute(vld2q_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_u32() { + let a: [u32; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5]; + let e: [u32x4; 2] = [u32x4::new(1, 2, 2, 3), u32x4::new(2, 3, 4, 5)]; + let r: [u32x4; 2] = transmute(vld2q_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_p8() { + let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 3, 2, 3, 4, 5), i8x8::new(2, 3, 4, 5, 6, 7, 8, 9)]; + let r: [i8x8; 2] = transmute(vld2_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_p16() { + let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 4, 3, 5]; + let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 3), i16x4::new(2, 3, 4, 5)]; + let r: [i16x4; 2] = transmute(vld2_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_p8() { + let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9), i8x16::new(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17)]; + let r: [i8x16; 2] = transmute(vld2q_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_p16() { + let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 3, 2, 3, 4, 5), i16x8::new(2, 3, 4, 5, 6, 7, 8, 9)]; + let r: [i16x8; 2] = transmute(vld2q_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_p64() { + let a: [u64; 3] = [0, 1, 2]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; + let r: [i64x1; 2] = transmute(vld2_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_f32() { + let a: [f32; 5] = [0., 1., 2., 2., 3.]; + let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(2., 3.)]; + let r: [f32x2; 2] = transmute(vld2_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_f32() { + let a: [f32; 9] = [0., 1., 2., 2., 3., 2., 4., 3., 5.]; + let e: [f32x4; 2] = [f32x4::new(1., 2., 2., 3.), f32x4::new(2., 3., 4., 5.)]; + let r: [f32x4; 2] = transmute(vld2q_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_s8_x2() { let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index e2d8f80dfb..fcbee79f2a 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2125,12 +2125,12 @@ arm-aarch64-separate aarch64 = ld2 link-aarch64 = ld2._EXTv2_ -//generate *const 
i64:int64x2x2_t +generate *const i64:int64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ -//generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t -//generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t +generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t /// Load multiple 2-element structures to two registers name = vld2 @@ -2141,17 +2141,17 @@ validate 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, load_fn aarch64 = ld2 -//generate *const u64:uint64x2x2_t +generate *const u64:uint64x2x2_t target = aes -//generate *const p64:poly64x2x2_t +generate *const p64:poly64x2x2_t target = default arm = vld2 -//generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t -//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t -//generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t +generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t target = aes -//generate *const p64:poly64x1x2_t +generate *const p64:poly64x1x2_t /// Load multiple 2-element structures to two registers name = vld2 @@ -2163,11 +2163,11 @@ arm-aarch64-separate aarch64 = ld2 link-aarch64 = ld2._EXTv2_ -//generate *const f64:float64x1x2_t, *const f64:float64x2x2_t +generate *const f64:float64x1x2_t, *const f64:float64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ -//generate *const f32:float32x2x2_t, *const f32:float32x4x2_t +generate *const f32:float32x2x2_t, *const f32:float32x4x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 From b7fe9aa74a72b2fc53007017ff7a8c11f304cfdd Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 09:20:34 +0800 Subject: [PATCH 02/28] correct assert_instr --- .../src/arm_shared/neon/generated.rs | 110 +++++++++--------- crates/stdarch-gen/neon.spec | 9 +- 2 files changed, 62 insertions(+), 57 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index b0f30d9ee5..db22582c09 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -6842,34 +6842,6 @@ pub unsafe fn vld2_s32(a: *const i32) -> int32x2x2_t { vld2_s32_(a.cast()) } -/// Load multiple 2-element structures to two registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] -pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v1i64.p0i8")] - fn vld2_s64_(ptr: *const i8, size: i32) -> int64x1x2_t; - } -vld2_s64_(a as *const i8, 8) -} - -/// Load multiple 2-element structures to two registers -#[inline] -#[cfg(target_arch = "aarch64")] -#[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] -pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch 
= "aarch64", link_name = "llvm.aarch64.neon.ld2.v1i64.p0v1i64")] - fn vld2_s64_(ptr: *const int64x1_t) -> int64x1x2_t; - } -vld2_s64_(a.cast()) -} - /// Load multiple 2-element structures to two registers #[inline] #[cfg(target_arch = "arm")] @@ -6956,12 +6928,30 @@ vld2q_s32_(a.cast()) /// Load multiple 2-element structures to two registers #[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] +pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2.v1i64.p0i8")] + fn vld2_s64_(ptr: *const i8, size: i32) -> int64x1x2_t; + } +vld2_s64_(a as *const i8, 8) +} + +/// Load multiple 2-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] -pub unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t { - transmute(vld2_s8(transmute(a))) +pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2.v1i64.p0v1i64")] + fn vld2_s64_(ptr: *const int64x1_t) -> int64x1x2_t; + } +vld2_s64_(a.cast()) } /// Load multiple 2-element structures to two registers @@ -6970,8 +6960,8 @@ pub unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] -pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t { - transmute(vld2_s16(transmute(a))) +pub unsafe fn vld2_u8(a: *const u8) -> uint8x8x2_t { + transmute(vld2_s8(transmute(a))) } /// Load multiple 2-element structures to two registers @@ -6980,8 +6970,8 @@ pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] -pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t { - transmute(vld2_s32(transmute(a))) +pub unsafe fn vld2_u16(a: *const u16) -> uint16x4x2_t { + transmute(vld2_s16(transmute(a))) } /// Load multiple 2-element structures to two registers @@ -6990,8 +6980,8 @@ pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] -pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { - transmute(vld2_s64(transmute(a))) +pub unsafe fn vld2_u32(a: *const u32) -> uint32x2x2_t { + transmute(vld2_s32(transmute(a))) } /// Load multiple 2-element structures to two registers @@ -7064,11 +7054,21 @@ pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t { transmute(vld2q_s16(transmute(a))) } +/// Load multiple 2-element structures to two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { + 
transmute(vld2_s64(transmute(a))) +} + /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] pub unsafe fn vld2_p64(a: *const p64) -> poly64x1x2_t { transmute(vld2_s64(transmute(a))) @@ -22093,14 +22093,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld2_s64() { - let a: [i64; 3] = [0, 1, 2]; - let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; - let r: [i64x1; 2] = transmute(vld2_s64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld2q_s8() { let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; @@ -22125,6 +22117,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2_s64() { + let a: [i64; 3] = [0, 1, 2]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; + let r: [i64x1; 2] = transmute(vld2_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld2_u8() { let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9]; @@ -22149,14 +22149,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld2_u64() { - let a: [u64; 3] = [0, 1, 2]; - let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)]; - let r: [u64x1; 2] = transmute(vld2_u64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld2q_u8() { let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; @@ -22213,6 +22205,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2_u64() { + let a: [u64; 3] = [0, 1, 2]; + let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)]; + let r: [u64x1; 2] = transmute(vld2_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld2_p64() { let a: [u64; 3] = [0, 1, 2]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index fcbee79f2a..a9c1ea6a27 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2129,8 +2129,10 @@ generate *const i64:int64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ -generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t +arm = vld +generate *const i64:int64x1x2_t /// Load multiple 2-element structures to two registers name = vld2 @@ -2147,12 +2149,15 @@ generate *const p64:poly64x2x2_t target = default arm = vld2 -generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t +arm = vld +generate *const u64:uint64x1x2_t target = aes generate *const p64:poly64x1x2_t + /// Load multiple 2-element structures to two registers name = vld2 out-nox From 
2b0d1aa1b7d96fdbb63f832c73dddcc12166f617 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 09:27:36 +0800 Subject: [PATCH 03/28] correct assert_instr --- crates/core_arch/src/aarch64/neon/generated.rs | 2 +- crates/core_arch/src/arm_shared/neon/generated.rs | 6 +++--- crates/stdarch-gen/neon.spec | 8 ++++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index d3c2d493bf..8997576864 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4624,7 +4624,7 @@ pub unsafe fn vld2q_p64(a: *const p64) -> poly64x2x2_t { /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2))] +#[cfg_attr(test, assert_instr(ld))] pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index db22582c09..9d8fbe9b05 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -6944,7 +6944,7 @@ vld2_s64_(a as *const i8, 8) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld))] pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7059,7 +7059,7 @@ pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld))] pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { transmute(vld2_s64(transmute(a))) } @@ -7069,7 +7069,7 @@ pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld))] pub unsafe fn vld2_p64(a: *const p64) -> poly64x1x2_t { transmute(vld2_s64(transmute(a))) } diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index a9c1ea6a27..e60e49cf5b 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2132,6 +2132,7 @@ link-arm = vld2._EXTpi82_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t arm = vld +aarch64 = ld generate *const i64:int64x1x2_t /// Load multiple 2-element structures to two registers @@ -2153,6 +2154,7 @@ generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t arm = vld +aarch64 = ld generate *const u64:uint64x1x2_t target = aes generate *const p64:poly64x1x2_t @@ -2166,9 +2168,11 @@ validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. 
load_fn arm-aarch64-separate -aarch64 = ld2 +aarch64 = ld link-aarch64 = ld2._EXTv2_ -generate *const f64:float64x1x2_t, *const f64:float64x2x2_t +generate *const f64:float64x1x2_t +aarch64 = ld2 +generate *const f64:float64x2x2_t arm = vld2 link-arm = vld2._EXTpi82_ From 22e6d114607f1de1247a353a2993775b4ddf0ccf Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 10:13:43 +0800 Subject: [PATCH 04/28] add vld3 and vld4 neon instrs --- .../core_arch/src/aarch64/neon/generated.rs | 190 +++ .../src/arm_shared/neon/generated.rs | 1080 +++++++++++++++++ crates/stdarch-gen/neon.spec | 44 +- 3 files changed, 1292 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 8997576864..a39c84c605 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4647,6 +4647,116 @@ pub unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t { vld2q_f64_(a.cast()) } +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_s64(a: *const i64) -> int64x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2i64.p0v2i64")] + fn vld3q_s64_(ptr: *const int64x2_t) -> int64x2x3_t; + } + vld3q_s64_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_u64(a: *const u64) -> uint64x2x3_t { + transmute(vld3q_s64(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_p64(a: *const p64) -> poly64x2x3_t { + transmute(vld3q_s64(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1f64.p0v1f64")] + fn vld3_f64_(ptr: *const float64x1_t) -> float64x1x3_t; + } + vld3_f64_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3))] +pub unsafe fn vld3q_f64(a: *const f64) -> float64x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2f64.p0v2f64")] + fn vld3q_f64_(ptr: *const float64x2_t) -> float64x2x3_t; + } + vld3q_f64_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_s64(a: *const i64) -> int64x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2i64.p0v2i64")] + fn vld4q_s64_(ptr: *const int64x2_t) -> int64x2x4_t; + } + vld4q_s64_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_u64(a: *const u64) -> uint64x2x4_t { + transmute(vld4q_s64(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] 
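+// poly64 variants additionally require the `aes` target feature (`target = aes` in neon.spec)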
+#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_p64(a: *const p64) -> poly64x2x4_t { + transmute(vld4q_s64(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1f64.p0v1f64")] + fn vld4_f64_(ptr: *const float64x1_t) -> float64x1x4_t; + } + vld4_f64_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4))] +pub unsafe fn vld4q_f64(a: *const f64) -> float64x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f64.p0v2f64")] + fn vld4q_f64_(ptr: *const float64x2_t) -> float64x2x4_t; + } + vld4q_f64_(a.cast()) +} + /// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] @@ -13156,6 +13266,86 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_s64() { + let a: [i64; 7] = [0, 1, 2, 2, 2, 4, 4]; + let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 4), i64x2::new(2, 4)]; + let r: [i64x2; 3] = transmute(vld3q_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_u64() { + let a: [u64; 7] = [0, 1, 2, 2, 2, 4, 4]; + let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(2, 4), u64x2::new(2, 4)]; + let r: [u64x2; 3] = transmute(vld3q_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_p64() { + let a: [u64; 7] = [0, 1, 2, 2, 2, 4, 4]; + let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 4), i64x2::new(2, 4)]; + let r: [i64x2; 3] = transmute(vld3q_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_f64() { + let a: [f64; 4] = [0., 1., 2., 2.]; + let e: [f64; 3] = [1., 2., 2.]; + let r: [f64; 3] = transmute(vld3_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_f64() { + let a: [f64; 7] = [0., 1., 2., 2., 2., 4., 4.]; + let e: [f64x2; 3] = [f64x2::new(1., 2.), f64x2::new(2., 4.), f64x2::new(2., 4.)]; + let r: [f64x2; 3] = transmute(vld3q_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_s64() { + let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 6), i64x2::new(2, 6), i64x2::new(6, 8)]; + let r: [i64x2; 4] = transmute(vld4q_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_u64() { + let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(2, 6), u64x2::new(2, 6), u64x2::new(6, 8)]; + let r: [u64x2; 4] = transmute(vld4q_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_p64() { + let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 6), i64x2::new(2, 6), i64x2::new(6, 8)]; + let r: [i64x2; 4] = transmute(vld4q_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_f64() { + let a: [f64; 5] = [0., 1., 2., 2., 6.]; + let e: [f64; 4] = 
[1., 2., 2., 6.]; + let r: [f64; 4] = transmute(vld4_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_f64() { + let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.]; + let e: [f64x2; 4] = [f64x2::new(1., 2.), f64x2::new(2., 6.), f64x2::new(2., 6.), f64x2::new(6., 8.)]; + let r: [f64x2; 4] = transmute(vld4q_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_f64_x2() { let a: [f64; 3] = [0., 1., 2.]; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 9d8fbe9b05..7bc3482a56 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7130,6 +7130,750 @@ pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t { vld2q_f32_(a.cast()) } +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8i8.p0i8")] + fn vld3_s8_(ptr: *const i8, size: i32) -> int8x8x3_t; + } +vld3_s8_(a as *const i8, 1) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_s8(a: *const i8) -> int8x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v8i8.p0v8i8")] + fn vld3_s8_(ptr: *const int8x8_t) -> int8x8x3_t; + } +vld3_s8_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4i16.p0i8")] + fn vld3_s16_(ptr: *const i8, size: i32) -> int16x4x3_t; + } +vld3_s16_(a as *const i8, 2) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_s16(a: *const i16) -> int16x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v4i16.p0v4i16")] + fn vld3_s16_(ptr: *const int16x4_t) -> int16x4x3_t; + } +vld3_s16_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v2i32.p0i8")] + fn vld3_s32_(ptr: *const i8, size: i32) -> int32x2x3_t; + } +vld3_s32_(a as *const i8, 4) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_s32(a: *const i32) -> 
int32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2i32.p0v2i32")] + fn vld3_s32_(ptr: *const int32x2_t) -> int32x2x3_t; + } +vld3_s32_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v1i64.p0i8")] + fn vld3_s64_(ptr: *const i8, size: i32) -> int64x1x3_t; + } +vld3_s64_(a as *const i8, 8) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1i64.p0v1i64")] + fn vld3_s64_(ptr: *const int64x1_t) -> int64x1x3_t; + } +vld3_s64_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v16i8.p0i8")] + fn vld3q_s8_(ptr: *const i8, size: i32) -> int8x16x3_t; + } +vld3q_s8_(a as *const i8, 1) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_s8(a: *const i8) -> int8x16x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v16i8.p0v16i8")] + fn vld3q_s8_(ptr: *const int8x16_t) -> int8x16x3_t; + } +vld3q_s8_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v8i16.p0i8")] + fn vld3q_s16_(ptr: *const i8, size: i32) -> int16x8x3_t; + } +vld3q_s16_(a as *const i8, 2) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_s16(a: *const i16) -> int16x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v8i16.p0v8i16")] + fn vld3q_s16_(ptr: *const int16x8_t) -> int16x8x3_t; + } +vld3q_s16_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vld3.v4i32.p0i8")] + fn vld3q_s32_(ptr: *const i8, size: i32) -> int32x4x3_t; + } +vld3q_s32_(a as *const i8, 4) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v4i32.p0v4i32")] + fn vld3q_s32_(ptr: *const int32x4_t) -> int32x4x3_t; + } +vld3q_s32_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_u8(a: *const u8) -> uint8x8x3_t { + transmute(vld3_s8(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_u16(a: *const u16) -> uint16x4x3_t { + transmute(vld3_s16(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_u32(a: *const u32) -> uint32x2x3_t { + transmute(vld3_s32(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_u64(a: *const u64) -> uint64x1x3_t { + transmute(vld3_s64(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_u8(a: *const u8) -> uint8x16x3_t { + transmute(vld3q_s8(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_u16(a: *const u16) -> uint16x8x3_t { + transmute(vld3q_s16(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_u32(a: *const u32) -> uint32x4x3_t { + transmute(vld3q_s32(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_p8(a: *const p8) -> poly8x8x3_t { + transmute(vld3_s8(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_p16(a: *const p16) -> poly16x4x3_t { + transmute(vld3_s16(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_p8(a: *const p8) -> poly8x16x3_t { + transmute(vld3q_s8(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_p16(a: *const p16) -> poly16x8x3_t { + transmute(vld3q_s16(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t { + transmute(vld3_s64(transmute(a))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v2f32.p0i8")] + fn vld3_f32_(ptr: *const i8, size: i32) -> float32x2x3_t; + } +vld3_f32_(a as *const i8, 4) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3_f32(a: *const f32) -> float32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v2f32.p0v2f32")] + fn vld3_f32_(ptr: *const float32x2_t) -> float32x2x3_t; + } +vld3_f32_(a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v4f32.p0i8")] + fn vld3q_f32_(ptr: *const i8, size: i32) -> float32x4x3_t; + } +vld3q_f32_(a as *const i8, 4) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, 
target_arch = "aarch64"), assert_instr(ld3))] +pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v4f32.p0v4f32")] + fn vld3q_f32_(ptr: *const float32x4_t) -> float32x4x3_t; + } +vld3q_f32_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8i8.p0i8")] + fn vld4_s8_(ptr: *const i8, size: i32) -> int8x8x4_t; + } +vld4_s8_(a as *const i8, 1) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_s8(a: *const i8) -> int8x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v8i8.p0v8i8")] + fn vld4_s8_(ptr: *const int8x8_t) -> int8x8x4_t; + } +vld4_s8_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4i16.p0i8")] + fn vld4_s16_(ptr: *const i8, size: i32) -> int16x4x4_t; + } +vld4_s16_(a as *const i8, 2) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_s16(a: *const i16) -> int16x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4i16.p0v4i16")] + fn vld4_s16_(ptr: *const int16x4_t) -> int16x4x4_t; + } +vld4_s16_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2i32.p0i8")] + fn vld4_s32_(ptr: *const i8, size: i32) -> int32x2x4_t; + } +vld4_s32_(a as *const i8, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2i32.p0v2i32")] + fn vld4_s32_(ptr: *const int32x2_t) -> int32x2x4_t; + } +vld4_s32_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" 
{ + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v1i64.p0i8")] + fn vld4_s64_(ptr: *const i8, size: i32) -> int64x1x4_t; + } +vld4_s64_(a as *const i8, 8) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1i64.p0v1i64")] + fn vld4_s64_(ptr: *const int64x1_t) -> int64x1x4_t; + } +vld4_s64_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v16i8.p0i8")] + fn vld4q_s8_(ptr: *const i8, size: i32) -> int8x16x4_t; + } +vld4q_s8_(a as *const i8, 1) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_s8(a: *const i8) -> int8x16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v16i8.p0v16i8")] + fn vld4q_s8_(ptr: *const int8x16_t) -> int8x16x4_t; + } +vld4q_s8_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v8i16.p0i8")] + fn vld4q_s16_(ptr: *const i8, size: i32) -> int16x8x4_t; + } +vld4q_s16_(a as *const i8, 2) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_s16(a: *const i16) -> int16x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v8i16.p0v8i16")] + fn vld4q_s16_(ptr: *const int16x8_t) -> int16x8x4_t; + } +vld4q_s16_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4i32.p0i8")] + fn vld4q_s32_(ptr: *const i8, size: i32) -> int32x4x4_t; + } +vld4q_s32_(a as *const i8, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4i32.p0v4i32")] + fn vld4q_s32_(ptr: *const int32x4_t) 
-> int32x4x4_t; + } +vld4q_s32_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t { + transmute(vld4_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t { + transmute(vld4_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t { + transmute(vld4_s32(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_s64(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t { + transmute(vld4q_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t { + transmute(vld4q_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t { + transmute(vld4q_s32(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t { + transmute(vld4_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] 
+pub unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t { + transmute(vld4_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t { + transmute(vld4q_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t { + transmute(vld4q_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { + transmute(vld4_s64(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2f32.p0i8")] + fn vld4_f32_(ptr: *const i8, size: i32) -> float32x2x4_t; + } +vld4_f32_(a as *const i8, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f32.p0v2f32")] + fn vld4_f32_(ptr: *const float32x2_t) -> float32x2x4_t; + } +vld4_f32_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4f32.p0i8")] + fn vld4q_f32_(ptr: *const i8, size: i32) -> float32x4x4_t; + } +vld4q_f32_(a as *const i8, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4f32.p0v4f32")] + fn vld4q_f32_(ptr: *const float32x4_t) -> float32x4x4_t; + } +vld4q_f32_(a.cast()) +} + /// Store multiple single-element structures from one, two, three, or four registers #[inline] #[cfg(target_arch = "arm")] @@ -22237,6 +22981,342 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3_s8() { + let a: [i8; 25] = [0, 1, 
2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 4, 2, 4, 7, 8), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [i8x8; 3] = transmute(vld3_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_s16() { + let a: [i16; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 4), i16x4::new(2, 4, 7, 8), i16x4::new(2, 4, 7, 8)]; + let r: [i16x4; 3] = transmute(vld3_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_s32() { + let a: [i32; 7] = [0, 1, 2, 2, 2, 4, 4]; + let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(2, 4), i32x2::new(2, 4)]; + let r: [i32x2; 3] = transmute(vld3_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_s64() { + let a: [i64; 4] = [0, 1, 2, 2]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 3] = transmute(vld3_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_s8() { + let a: [i8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; + let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48)]; + let r: [i8x16; 3] = transmute(vld3q_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_s16() { + let a: [i16; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 4, 2, 4, 7, 8), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [i16x8; 3] = transmute(vld3q_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_s32() { + let a: [i32; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let e: [i32x4; 3] = [i32x4::new(1, 2, 2, 4), i32x4::new(2, 4, 7, 8), i32x4::new(2, 4, 7, 8)]; + let r: [i32x4; 3] = transmute(vld3q_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_u8() { + let a: [u8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let e: [u8x8; 3] = [u8x8::new(1, 2, 2, 4, 2, 4, 7, 8), u8x8::new(2, 4, 7, 8, 13, 14, 15, 16), u8x8::new(2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [u8x8; 3] = transmute(vld3_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_u16() { + let a: [u16; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let e: [u16x4; 3] = [u16x4::new(1, 2, 2, 4), u16x4::new(2, 4, 7, 8), u16x4::new(2, 4, 7, 8)]; + let r: [u16x4; 3] = transmute(vld3_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_u32() { + let a: [u32; 7] = [0, 1, 2, 2, 2, 4, 4]; + let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(2, 4), u32x2::new(2, 4)]; + let r: [u32x2; 3] = transmute(vld3_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_u64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(2)]; + let r: [u64x1; 3] = 
transmute(vld3_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_u8() { + let a: [u8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; + let e: [u8x16; 3] = [u8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16), u8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32), u8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48)]; + let r: [u8x16; 3] = transmute(vld3q_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_u16() { + let a: [u16; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let e: [u16x8; 3] = [u16x8::new(1, 2, 2, 4, 2, 4, 7, 8), u16x8::new(2, 4, 7, 8, 13, 14, 15, 16), u16x8::new(2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [u16x8; 3] = transmute(vld3q_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_u32() { + let a: [u32; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let e: [u32x4; 3] = [u32x4::new(1, 2, 2, 4), u32x4::new(2, 4, 7, 8), u32x4::new(2, 4, 7, 8)]; + let r: [u32x4; 3] = transmute(vld3q_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_p8() { + let a: [u8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 4, 2, 4, 7, 8), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16), i8x8::new(2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [i8x8; 3] = transmute(vld3_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_p16() { + let a: [u16; 13] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 4), i16x4::new(2, 4, 7, 8), i16x4::new(2, 4, 7, 8)]; + let r: [i16x4; 3] = transmute(vld3_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_p8() { + let a: [u8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; + let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32), i8x16::new(2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48)]; + let r: [i8x16; 3] = transmute(vld3q_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_p16() { + let a: [u16; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 4, 2, 4, 7, 8), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16), i16x8::new(2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [i16x8; 3] = transmute(vld3q_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_p64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 3] = transmute(vld3_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_f32() { + let a: [f32; 7] = [0., 1., 2., 2., 2., 4., 4.]; + let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(2., 4.), f32x2::new(2., 4.)]; + let r: [f32x2; 3] = transmute(vld3_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + 
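+    // The expected vectors in these vld3 tests follow the instruction's
+    // de-interleaving rule: lane `i` of output register `r` is
+    // `input[3 * i + r]` (the vld4 tests below are analogous, with
+    // `input[4 * i + r]`). A minimal sketch of that rule, using a
+    // hypothetical helper for the 8-lane i8 case (illustration only, not
+    // part of this patch):
+    //
+    //     fn deinterleave3(input: &[i8; 24]) -> [[i8; 8]; 3] {
+    //         let mut out = [[0i8; 8]; 3];
+    //         for i in 0..8 {
+    //             for r in 0..3 {
+    //                 // register r, lane i <- element r of the i-th structure
+    //                 out[r][i] = input[3 * i + r];
+    //             }
+    //         }
+    //         out
+    //     }
+    //
+    // Applied to `a[1..]` in `test_vld3_s8` above, this reproduces the three
+    // `i8x8` vectors asserted there.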
#[simd_test(enable = "neon")] + unsafe fn test_vld3q_f32() { + let a: [f32; 13] = [0., 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8.]; + let e: [f32x4; 3] = [f32x4::new(1., 2., 2., 4.), f32x4::new(2., 4., 7., 8.), f32x4::new(2., 4., 7., 8.)]; + let r: [f32x4; 3] = transmute(vld3q_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_s8() { + let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 6, 2, 6, 6, 8), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(6, 8, 8, 16, 8, 16, 16, 32)]; + let r: [i8x8; 4] = transmute(vld4_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_s16() { + let a: [i16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 6), i16x4::new(2, 6, 6, 8), i16x4::new(2, 6, 6, 8), i16x4::new(6, 8, 8, 16)]; + let r: [i16x4; 4] = transmute(vld4_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_s32() { + let a: [i32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(2, 6), i32x2::new(2, 6), i32x2::new(6, 8)]; + let r: [i32x2; 4] = transmute(vld4_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_s64() { + let a: [i64; 5] = [0, 1, 2, 2, 6]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(6)]; + let r: [i64x1; 4] = transmute(vld4_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_s8() { + let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48), i8x16::new(6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64)]; + let r: [i8x16; 4] = transmute(vld4q_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_s16() { + let a: [i16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 6, 2, 6, 6, 8), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(6, 8, 8, 16, 8, 16, 16, 32)]; + let r: [i16x8; 4] = transmute(vld4q_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_s32() { + let a: [i32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [i32x4; 4] = [i32x4::new(1, 2, 2, 6), i32x4::new(2, 6, 6, 8), i32x4::new(2, 6, 6, 8), i32x4::new(6, 8, 8, 16)]; + let r: [i32x4; 4] = transmute(vld4q_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_u8() { + let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [u8x8; 4] = [u8x8::new(1, 2, 2, 6, 2, 6, 6, 8), u8x8::new(2, 6, 6, 8, 6, 8, 8, 16), u8x8::new(2, 6, 6, 8, 6, 8, 8, 16), u8x8::new(6, 8, 8, 16, 8, 16, 16, 32)]; + let r: [u8x8; 4] = 
transmute(vld4_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_u16() { + let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [u16x4; 4] = [u16x4::new(1, 2, 2, 6), u16x4::new(2, 6, 6, 8), u16x4::new(2, 6, 6, 8), u16x4::new(6, 8, 8, 16)]; + let r: [u16x4; 4] = transmute(vld4_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_u32() { + let a: [u32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(2, 6), u32x2::new(2, 6), u32x2::new(6, 8)]; + let r: [u32x2; 4] = transmute(vld4_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_u64() { + let a: [u64; 5] = [0, 1, 2, 2, 6]; + let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(2), u64x1::new(6)]; + let r: [u64x1; 4] = transmute(vld4_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_u8() { + let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let e: [u8x16; 4] = [u8x16::new(1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16), u8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32), u8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48), u8x16::new(6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64)]; + let r: [u8x16; 4] = transmute(vld4q_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_u16() { + let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [u16x8; 4] = [u16x8::new(1, 2, 2, 6, 2, 6, 6, 8), u16x8::new(2, 6, 6, 8, 6, 8, 8, 16), u16x8::new(2, 6, 6, 8, 6, 8, 8, 16), u16x8::new(6, 8, 8, 16, 8, 16, 16, 32)]; + let r: [u16x8; 4] = transmute(vld4q_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_u32() { + let a: [u32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [u32x4; 4] = [u32x4::new(1, 2, 2, 6), u32x4::new(2, 6, 6, 8), u32x4::new(2, 6, 6, 8), u32x4::new(6, 8, 8, 16)]; + let r: [u32x4; 4] = transmute(vld4q_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_p8() { + let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 6, 2, 6, 6, 8), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(2, 6, 6, 8, 6, 8, 8, 16), i8x8::new(6, 8, 8, 16, 8, 16, 16, 32)]; + let r: [i8x8; 4] = transmute(vld4_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_p16() { + let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 6), i16x4::new(2, 6, 6, 8), i16x4::new(2, 6, 6, 8), i16x4::new(6, 8, 8, 16)]; + let r: [i16x4; 4] = transmute(vld4_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_p8() { + let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let e: [i8x16; 4] = 
[i8x16::new(1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32), i8x16::new(2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48), i8x16::new(6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64)]; + let r: [i8x16; 4] = transmute(vld4q_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_p16() { + let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 6, 2, 6, 6, 8), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(2, 6, 6, 8, 6, 8, 8, 16), i16x8::new(6, 8, 8, 16, 8, 16, 16, 32)]; + let r: [i16x8; 4] = transmute(vld4q_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_p64() { + let a: [u64; 5] = [0, 1, 2, 2, 6]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(6)]; + let r: [i64x1; 4] = transmute(vld4_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_f32() { + let a: [f32; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.]; + let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(2., 6.), f32x2::new(2., 6.), f32x2::new(6., 8.)]; + let r: [f32x2; 4] = transmute(vld4_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_f32() { + let a: [f32; 17] = [0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 15., 16.]; + let e: [f32x4; 4] = [f32x4::new(1., 2., 2., 6.), f32x4::new(2., 6., 6., 8.), f32x4::new(2., 6., 6., 15.), f32x4::new(6., 8., 8., 16.)]; + let r: [f32x4; 4] = transmute(vld4q_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_s8_x2() { let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index e60e49cf5b..4abb5ad7d2 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2314,12 +2314,12 @@ arm-aarch64-separate aarch64 = ld3 link-aarch64 = ld3._EXTv2_ -//generate *const i64:int64x2x3_t +generate *const i64:int64x2x3_t arm = vld3 link-arm = vld3._EXTpi82_ -//generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t -//generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t +generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t +generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t /// Load multiple 3-element structures to three registers name = vld3 @@ -2330,17 +2330,17 @@ validate 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, load_fn aarch64 = ld3 -//generate *const u64:uint64x2x3_t +generate *const u64:uint64x2x3_t target = aes -//generate *const p64:poly64x2x3_t +generate *const p64:poly64x2x3_t target = default arm = vld3 -//generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t -//generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t -//generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t +generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t +generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t +generate *const p8:poly8x8x3_t, *const 
p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t target = aes -//generate *const p64:poly64x1x3_t +generate *const p64:poly64x1x3_t /// Load multiple 3-element structures to three registers name = vld3 @@ -2352,11 +2352,11 @@ arm-aarch64-separate aarch64 = ld3 link-aarch64 = ld3._EXTv2_ -//generate *const f64:float64x1x3_t, *const f64:float64x2x3_t +generate *const f64:float64x1x3_t, *const f64:float64x2x3_t arm = vld3 link-arm = vld3._EXTpi82_ -//generate *const f32:float32x2x3_t, *const f32:float32x4x3_t +generate *const f32:float32x2x3_t, *const f32:float32x4x3_t /// Load single 3-element structure and replicate to all lanes of three registers name = vld3 @@ -2491,12 +2491,12 @@ arm-aarch64-separate aarch64 = ld4 link-aarch64 = ld4._EXTv2_ -//generate *const i64:int64x2x4_t +generate *const i64:int64x2x4_t arm = vld4 link-arm = vld4._EXTpi82_ -//generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t -//generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t +generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t +generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t /// Load multiple 4-element structures to four registers name = vld4 @@ -2507,17 +2507,17 @@ validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 1 load_fn aarch64 = ld4 -//generate *const u64:uint64x2x4_t +generate *const u64:uint64x2x4_t target = aes -//generate *const p64:poly64x2x4_t +generate *const p64:poly64x2x4_t target = default arm = vld4 -//generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t -//generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t -//generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t +generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t +generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t +generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t target = aes -//generate *const p64:poly64x1x4_t +generate *const p64:poly64x1x4_t /// Load multiple 4-element structures to four registers name = vld4 @@ -2529,11 +2529,11 @@ arm-aarch64-separate aarch64 = ld4 link-aarch64 = ld4._EXTv2_ -//generate *const f64:float64x1x4_t, *const f64:float64x2x4_t +generate *const f64:float64x1x4_t, *const f64:float64x2x4_t arm = vld4 link-arm = vld4._EXTpi82_ -//generate *const f32:float32x2x4_t, *const f32:float32x4x4_t +generate *const f32:float32x2x4_t, *const f32:float32x4x4_t /// Load single 4-element structure and replicate to all lanes of four registers name = vld4 From b69d3e748a0b458a72bc051eb9bfb94100e1649e Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 10:25:48 +0800 Subject: [PATCH 05/28] correct assert_instr --- .../core_arch/src/aarch64/neon/generated.rs | 6 +- .../src/arm_shared/neon/generated.rs | 236 +++++++++--------- crates/stdarch-gen/neon.spec | 42 +++- 3 files changed, 150 insertions(+), 134 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index a39c84c605..ba83c269a9 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4624,7 +4624,7 @@ pub unsafe fn vld2q_p64(a: *const p64) -> poly64x2x2_t { /// Load 
multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld))] +#[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld2_f64(a: *const f64) -> float64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -4679,7 +4679,7 @@ pub unsafe fn vld3q_p64(a: *const p64) -> poly64x2x3_t { /// Load multiple 3-element structures to three registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld3))] +#[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld3_f64(a: *const f64) -> float64x1x3_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -4734,7 +4734,7 @@ pub unsafe fn vld4q_p64(a: *const p64) -> poly64x2x4_t { /// Load multiple 4-element structures to four registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld4))] +#[cfg_attr(test, assert_instr(nop))] pub unsafe fn vld4_f64(a: *const f64) -> float64x1x4_t { #[allow(improper_ctypes)] extern "unadjusted" { diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 7bc3482a56..002fe2d841 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -6930,7 +6930,7 @@ vld2q_s32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -6944,7 +6944,7 @@ vld2_s64_(a as *const i8, 8) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vld2_s64(a: *const i64) -> int64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7058,8 +7058,8 @@ pub unsafe fn vld2q_p16(a: *const p16) -> poly16x8x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { transmute(vld2_s64(transmute(a))) } @@ -7068,8 +7068,8 @@ pub unsafe fn vld2_u64(a: *const u64) -> uint64x1x2_t { #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vld2_p64(a: *const p64) -> poly64x1x2_t { transmute(vld2_s64(transmute(a))) } @@ -7214,34 +7214,6 @@ pub unsafe fn vld3_s32(a: *const i32) -> int32x2x3_t { vld3_s32_(a.cast()) } -/// Load multiple 3-element structures to three registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] -pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vld3.v1i64.p0i8")] - fn vld3_s64_(ptr: *const i8, size: i32) -> int64x1x3_t; - } -vld3_s64_(a as *const i8, 8) -} - -/// Load multiple 3-element structures to three registers -#[inline] -#[cfg(target_arch = "aarch64")] -#[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] -pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1i64.p0v1i64")] - fn vld3_s64_(ptr: *const int64x1_t) -> int64x1x3_t; - } -vld3_s64_(a.cast()) -} - /// Load multiple 3-element structures to three registers #[inline] #[cfg(target_arch = "arm")] @@ -7326,6 +7298,34 @@ pub unsafe fn vld3q_s32(a: *const i32) -> int32x4x3_t { vld3q_s32_(a.cast()) } +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3.v1i64.p0i8")] + fn vld3_s64_(ptr: *const i8, size: i32) -> int64x1x3_t; + } +vld3_s64_(a as *const i8, 8) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld3_s64(a: *const i64) -> int64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3.v1i64.p0v1i64")] + fn vld3_s64_(ptr: *const int64x1_t) -> int64x1x3_t; + } +vld3_s64_(a.cast()) +} + /// Load multiple 3-element structures to three registers #[inline] #[target_feature(enable = "neon")] @@ -7356,16 +7356,6 @@ pub unsafe fn vld3_u32(a: *const u32) -> uint32x2x3_t { transmute(vld3_s32(transmute(a))) } -/// Load multiple 3-element structures to three registers -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] -pub unsafe fn vld3_u64(a: *const u64) -> uint64x1x3_t { - transmute(vld3_s64(transmute(a))) -} - /// Load multiple 3-element structures to three registers #[inline] #[target_feature(enable = "neon")] @@ -7436,12 +7426,22 @@ pub unsafe fn vld3q_p16(a: *const p16) -> poly16x8x3_t { transmute(vld3q_s16(transmute(a))) } +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld3_u64(a: *const u64) -> uint64x1x3_t { + transmute(vld3_s64(transmute(a))) +} + /// Load multiple 3-element structures to three registers #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vld3_p64(a: *const p64) -> poly64x1x3_t { 
transmute(vld3_s64(transmute(a))) } @@ -7586,34 +7586,6 @@ pub unsafe fn vld4_s32(a: *const i32) -> int32x2x4_t { vld4_s32_(a.cast()) } -/// Load multiple 4-element structures to four registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v1i64.p0i8")] - fn vld4_s64_(ptr: *const i8, size: i32) -> int64x1x4_t; - } -vld4_s64_(a as *const i8, 8) -} - -/// Load multiple 4-element structures to four registers -#[inline] -#[cfg(target_arch = "aarch64")] -#[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1i64.p0v1i64")] - fn vld4_s64_(ptr: *const int64x1_t) -> int64x1x4_t; - } -vld4_s64_(a.cast()) -} - /// Load multiple 4-element structures to four registers #[inline] #[cfg(target_arch = "arm")] @@ -7698,6 +7670,34 @@ pub unsafe fn vld4q_s32(a: *const i32) -> int32x4x4_t { vld4q_s32_(a.cast()) } +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v1i64.p0i8")] + fn vld4_s64_(ptr: *const i8, size: i32) -> int64x1x4_t; + } +vld4_s64_(a as *const i8, 8) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1i64.p0v1i64")] + fn vld4_s64_(ptr: *const int64x1_t) -> int64x1x4_t; + } +vld4_s64_(a.cast()) +} + /// Load multiple 4-element structures to four registers #[inline] #[target_feature(enable = "neon")] @@ -7728,16 +7728,6 @@ pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t { transmute(vld4_s32(transmute(a))) } -/// Load multiple 4-element structures to four registers -#[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t { - transmute(vld4_s64(transmute(a))) -} - /// Load multiple 4-element structures to four registers #[inline] #[target_feature(enable = "neon")] @@ -7808,12 +7798,22 @@ pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t { transmute(vld4q_s16(transmute(a))) } +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_s64(transmute(a))) +} + /// Load multiple 4-element 
structures to four registers #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { transmute(vld4_s64(transmute(a))) } @@ -23005,14 +23005,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld3_s64() { - let a: [i64; 4] = [0, 1, 2, 2]; - let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)]; - let r: [i64x1; 3] = transmute(vld3_s64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld3q_s8() { let a: [i8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; @@ -23037,6 +23029,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3_s64() { + let a: [i64; 4] = [0, 1, 2, 2]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 3] = transmute(vld3_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld3_u8() { let a: [u8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; @@ -23061,14 +23061,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld3_u64() { - let a: [u64; 4] = [0, 1, 2, 2]; - let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(2)]; - let r: [u64x1; 3] = transmute(vld3_u64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld3q_u8() { let a: [u8; 49] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; @@ -23125,6 +23117,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3_u64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(2)]; + let r: [u64x1; 3] = transmute(vld3_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld3_p64() { let a: [u64; 4] = [0, 1, 2, 2]; @@ -23173,14 +23173,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld4_s64() { - let a: [i64; 5] = [0, 1, 2, 2, 6]; - let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(6)]; - let r: [i64x1; 4] = transmute(vld4_s64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld4q_s8() { let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; @@ -23205,6 +23197,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld4_s64() { + let a: [i64; 5] = [0, 1, 2, 2, 6]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(6)]; + let r: [i64x1; 4] = transmute(vld4_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld4_u8() { let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 
8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; @@ -23229,14 +23229,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld4_u64() { - let a: [u64; 5] = [0, 1, 2, 2, 6]; - let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(2), u64x1::new(6)]; - let r: [u64x1; 4] = transmute(vld4_u64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld4q_u8() { let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; @@ -23293,6 +23285,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld4_u64() { + let a: [u64; 5] = [0, 1, 2, 2, 6]; + let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(2), u64x1::new(6)]; + let r: [u64x1; 4] = transmute(vld4_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld4_p64() { let a: [u64; 5] = [0, 1, 2, 2, 6]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 4abb5ad7d2..853f5bc66c 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2131,8 +2131,8 @@ arm = vld2 link-arm = vld2._EXTpi82_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t -arm = vld -aarch64 = ld +arm = nop +aarch64 = nop generate *const i64:int64x1x2_t /// Load multiple 2-element structures to two registers @@ -2153,8 +2153,8 @@ arm = vld2 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t -arm = vld -aarch64 = ld +arm = nop +aarch64 = nop generate *const u64:uint64x1x2_t target = aes generate *const p64:poly64x1x2_t @@ -2168,7 +2168,7 @@ validate 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5., 6., 7., 8., 9. load_fn arm-aarch64-separate -aarch64 = ld +aarch64 = nop link-aarch64 = ld2._EXTv2_ generate *const f64:float64x1x2_t aarch64 = ld2 @@ -2318,8 +2318,11 @@ generate *const i64:int64x2x3_t arm = vld3 link-arm = vld3._EXTpi82_ -generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t +generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t +arm = nop +aarch64 = nop +generate *const i64:int64x1x3_t /// Load multiple 3-element structures to three registers name = vld3 @@ -2336,9 +2339,12 @@ generate *const p64:poly64x2x3_t target = default arm = vld3 -generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t +generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t +arm = nop +aarch64 = nop +generate *const u64:uint64x1x3_t target = aes generate *const p64:poly64x1x3_t @@ -2350,9 +2356,11 @@ validate 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8. 
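// `validate` lists the lanes expected after de-interleaving; stdarch-gen emits
// them as the `e` vectors of the generated tests. The one-lane f64 variant
// below asserts `nop` because a single 64-bit structure involves no
// de-interleaving, so no particular load instruction is guaranteed.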
load_fn arm-aarch64-separate -aarch64 = ld3 +aarch64 = nop link-aarch64 = ld3._EXTv2_ -generate *const f64:float64x1x3_t, *const f64:float64x2x3_t +generate *const f64:float64x1x3_t +aarch64 = ld3 +generate *const f64:float64x2x3_t arm = vld3 link-arm = vld3._EXTpi82_ @@ -2495,8 +2503,11 @@ generate *const i64:int64x2x4_t arm = vld4 link-arm = vld4._EXTpi82_ -generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t +generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t +aarch64 = nop +arm = nop +generate *const i64:int64x1x4_t /// Load multiple 4-element structures to four registers name = vld4 @@ -2513,9 +2524,12 @@ generate *const p64:poly64x2x4_t target = default arm = vld4 -generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t +generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t +aarch64 = nop +arm = nop +generate *const u64:uint64x1x4_t target = aes generate *const p64:poly64x1x4_t @@ -2527,9 +2541,11 @@ validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 15., 6., 8., 8., 16. load_fn arm-aarch64-separate -aarch64 = ld4 +aarch64 = nop link-aarch64 = ld4._EXTv2_ -generate *const f64:float64x1x4_t, *const f64:float64x2x4_t +generate *const f64:float64x1x4_t +aarch64 = ld4 +generate *const f64:float64x2x4_t arm = vld4 link-arm = vld4._EXTpi82_ From fdb938bd76bf4107bcd40ded525264ae9a69f812 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 14:19:20 +0800 Subject: [PATCH 06/28] add vld2_dup neon instrs --- .../core_arch/src/aarch64/neon/generated.rs | 95 ++++ .../src/arm_shared/neon/generated.rs | 432 ++++++++++++++++++ crates/stdarch-gen/neon.spec | 22 +- 3 files changed, 538 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index ba83c269a9..5b291c8058 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4647,6 +4647,61 @@ pub unsafe fn vld2q_f64(a: *const f64) -> float64x2x2_t { vld2q_f64_(a.cast()) } +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i64.p0i64")] + fn vld2q_dup_s64_(a: *const i64) -> int64x2x2_t; + } + vld2q_dup_s64_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_u64(a: *const u64) -> uint64x2x2_t { + transmute(vld2q_dup_s64(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_p64(a: *const p64) -> poly64x2x2_t { + transmute(vld2q_dup_s64(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2_dup_f64(a: *const f64) -> float64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1f64.p0f64")] + fn vld2_dup_f64_(a: *const f64) -> float64x1x2_t; + } + vld2_dup_f64_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld2r))] +pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f64.p0f64")] + fn vld2q_dup_f64_(a: *const f64) -> float64x2x2_t; + } + vld2q_dup_f64_(a) +} + /// Load multiple 3-element structures to three registers #[inline] #[target_feature(enable = "neon")] @@ -13266,6 +13321,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_s64() { + let a: [i64; 5] = [0, 1, 1, 2, 3]; + let e: [i64x2; 2] = [i64x2::new(1, 1), i64x2::new(1, 1)]; + let r: [i64x2; 2] = transmute(vld2q_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_u64() { + let a: [u64; 5] = [0, 1, 1, 2, 3]; + let e: [u64x2; 2] = [u64x2::new(1, 1), u64x2::new(1, 1)]; + let r: [u64x2; 2] = transmute(vld2q_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_p64() { + let a: [u64; 5] = [0, 1, 1, 2, 3]; + let e: [i64x2; 2] = [i64x2::new(1, 1), i64x2::new(1, 1)]; + let r: [i64x2; 2] = transmute(vld2q_dup_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_f64() { + let a: [f64; 3] = [0., 1., 1.]; + let e: [f64; 2] = [1., 1.]; + let r: [f64; 2] = transmute(vld2_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_f64() { + let a: [f64; 5] = [0., 1., 1., 2., 3.]; + let e: [f64x2; 2] = [f64x2::new(1., 1.), f64x2::new(1., 1.)]; + let r: [f64x2; 2] = transmute(vld2q_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld3q_s64() { let a: [i64; 7] = [0, 1, 2, 2, 2, 4, 4]; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 002fe2d841..fa67a47b4a 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7130,6 +7130,270 @@ pub unsafe fn vld2q_f32(a: *const f32) -> float32x4x2_t { vld2q_f32_(a.cast()) } +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i8.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i8.p0i8")] + fn vld2_dup_s8_(a: *const i8) -> int8x8x2_t; + } +vld2_dup_s8_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i16.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i16.p0i16")] + fn vld2_dup_s16_(a: *const i16) -> int16x4x2_t; + } +vld2_dup_s16_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2i32.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i32.p0i32")] + fn vld2_dup_s32_(a: *const i32) -> int32x2x2_t; + } +vld2_dup_s32_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v1i64.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1i64.p0i64")] + fn vld2_dup_s64_(a: *const i64) -> int64x1x2_t; + } +vld2_dup_s64_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v16i8.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v16i8.p0i8")] + fn vld2q_dup_s8_(a: *const i8) -> int8x16x2_t; + } +vld2q_dup_s8_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i16.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i16.p0i16")] + fn vld2q_dup_s16_(a: *const i16) -> int16x8x2_t; + } +vld2q_dup_s16_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), 
assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i32.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i32.p0i32")] + fn vld2q_dup_s32_(a: *const i32) -> int32x4x2_t; + } +vld2q_dup_s32_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { + transmute(vld2_dup_s8(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { + transmute(vld2_dup_s16(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { + transmute(vld2_dup_s32(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t { + transmute(vld2_dup_s64(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t { + transmute(vld2q_dup_s8(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t { + transmute(vld2q_dup_s16(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t { + transmute(vld2q_dup_s32(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = 
"neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t { + transmute(vld2_dup_s8(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_p16(a: *const p16) -> poly16x4x2_t { + transmute(vld2_dup_s16(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t { + transmute(vld2q_dup_s8(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t { + transmute(vld2q_dup_s16(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t { + transmute(vld2_dup_s64(transmute(a))) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2f32.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f32.p0f32")] + fn vld2_dup_f32_(a: *const f32) -> float32x2x2_t; + } +vld2_dup_f32_(a) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4f32.p0i8")] + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4f32.p0f32")] + fn vld2q_dup_f32_(a: *const f32) -> float32x4x2_t; + } +vld2q_dup_f32_(a) +} + /// Load multiple 3-element structures to three registers #[inline] #[cfg(target_arch = "arm")] 
@@ -22981,6 +23245,174 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_s8() { + let a: [i8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x8; 2] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x8; 2] = transmute(vld2_dup_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_s16() { + let a: [i16; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5]; + let e: [i16x4; 2] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)]; + let r: [i16x4; 2] = transmute(vld2_dup_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_s32() { + let a: [i32; 5] = [0, 1, 1, 2, 3]; + let e: [i32x2; 2] = [i32x2::new(1, 1), i32x2::new(1, 1)]; + let r: [i32x2; 2] = transmute(vld2_dup_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_s64() { + let a: [i64; 3] = [0, 1, 1]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 2] = transmute(vld2_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_s8() { + let a: [i8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [i8x16; 2] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x16; 2] = transmute(vld2q_dup_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_s16() { + let a: [i16; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x8; 2] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i16x8; 2] = transmute(vld2q_dup_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_s32() { + let a: [i32; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5]; + let e: [i32x4; 2] = [i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1)]; + let r: [i32x4; 2] = transmute(vld2q_dup_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_u8() { + let a: [u8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; + let e: [u8x8; 2] = [u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u8x8; 2] = transmute(vld2_dup_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_u16() { + let a: [u16; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5]; + let e: [u16x4; 2] = [u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1)]; + let r: [u16x4; 2] = transmute(vld2_dup_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_u32() { + let a: [u32; 5] = [0, 1, 1, 2, 3]; + let e: [u32x2; 2] = [u32x2::new(1, 1), u32x2::new(1, 1)]; + let r: [u32x2; 2] = transmute(vld2_dup_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_u64() { + let a: [u64; 3] = [0, 1, 1]; + let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(1)]; + let r: [u64x1; 2] = transmute(vld2_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_u8() { + let a: [u8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [u8x16; 2] = [u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), 
u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u8x16; 2] = transmute(vld2q_dup_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_u16() { + let a: [u16; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; + let e: [u16x8; 2] = [u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u16x8; 2] = transmute(vld2q_dup_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_u32() { + let a: [u32; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5]; + let e: [u32x4; 2] = [u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1)]; + let r: [u32x4; 2] = transmute(vld2q_dup_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_p8() { + let a: [u8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x8; 2] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x8; 2] = transmute(vld2_dup_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_p16() { + let a: [u16; 9] = [0, 1, 1, 2, 3, 1, 4, 3, 5]; + let e: [i16x4; 2] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)]; + let r: [i16x4; 2] = transmute(vld2_dup_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_p8() { + let a: [u8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [i8x16; 2] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x16; 2] = transmute(vld2q_dup_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_p16() { + let a: [u16; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x8; 2] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i16x8; 2] = transmute(vld2q_dup_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_p64() { + let a: [u64; 3] = [0, 1, 1]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 2] = transmute(vld2_dup_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_f32() { + let a: [f32; 5] = [0., 1., 1., 2., 3.]; + let e: [f32x2; 2] = [f32x2::new(1., 1.), f32x2::new(1., 1.)]; + let r: [f32x2; 2] = transmute(vld2_dup_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_dup_f32() { + let a: [f32; 9] = [0., 1., 1., 2., 3., 1., 4., 3., 5.]; + let e: [f32x4; 2] = [f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.)]; + let r: [f32x4; 2] = transmute(vld2q_dup_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld3_s8() { let a: [i8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 853f5bc66c..1da8a83345 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2187,12 +2187,12 @@ load_fn aarch64 = ld2r link-aarch64 = ld2r._EXT2_ -//generate *const i64:int64x2x2_t +generate *const i64:int64x2x2_t arm = vld2dup link-arm = vld2dup._EXTpi82_ -//generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t -//generate 
*const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t +generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 @@ -2203,17 +2203,17 @@ validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, load_fn aarch64 = ld2r -//generate *const u64:uint64x2x2_t +generate *const u64:uint64x2x2_t target = aes -//generate *const p64:poly64x2x2_t +generate *const p64:poly64x2x2_t target = default arm = vld2dup -//generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t -//generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t -//generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t +generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t target = aes -//generate *const p64:poly64x1x2_t +generate *const p64:poly64x1x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 @@ -2224,11 +2224,11 @@ load_fn aarch64 = ld2r link-aarch64 = ld2r._EXT2_ -//generate *const f64:float64x1x2_t, *const f64:float64x2x2_t +generate *const f64:float64x1x2_t, *const f64:float64x2x2_t arm = vld2dup link-arm = vld2dup._EXTpi82_ -//generate *const f32:float32x2x2_t, *const f32:float32x4x2_t +generate *const f32:float32x2x2_t, *const f32:float32x4x2_t /// Load multiple 2-element structures to two registers name = vld2 From 05d2a25ae89852568ffbf771cb41258b7818eec4 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 14:43:36 +0800 Subject: [PATCH 07/28] correct extern link --- .../core_arch/src/aarch64/neon/generated.rs | 12 +- .../src/arm_shared/neon/generated.rs | 198 ++++++++++++++---- crates/stdarch-gen/neon.spec | 2 + crates/stdarch-gen/src/main.rs | 19 +- 4 files changed, 175 insertions(+), 56 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 5b291c8058..10373c38d8 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4655,9 +4655,9 @@ pub unsafe fn vld2q_dup_s64(a: *const i64) -> int64x2x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i64.p0i64")] - fn vld2q_dup_s64_(a: *const i64) -> int64x2x2_t; + fn vld2q_dup_s64_(ptr: *const i64) -> int64x2x2_t; } - vld2q_dup_s64_(a) + vld2q_dup_s64_(a.cast()) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -4684,9 +4684,9 @@ pub unsafe fn vld2_dup_f64(a: *const f64) -> float64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1f64.p0f64")] - fn vld2_dup_f64_(a: *const f64) -> float64x1x2_t; + fn vld2_dup_f64_(ptr: *const f64) -> float64x1x2_t; } - vld2_dup_f64_(a) + vld2_dup_f64_(a.cast()) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -4697,9 +4697,9 @@ pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t { #[allow(improper_ctypes)] extern 
"unadjusted" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f64.p0f64")] - fn vld2q_dup_f64_(a: *const f64) -> float64x2x2_t; + fn vld2q_dup_f64_(ptr: *const f64) -> float64x2x2_t; } - vld2q_dup_f64_(a) + vld2q_dup_f64_(a.cast()) } /// Load multiple 3-element structures to three registers diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index fa67a47b4a..8ecd44f840 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7132,114 +7132,198 @@ vld2q_f32_(a.cast()) /// Load single 2-element structure and replicate to all lanes of two registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i8.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i8.p0i8")] - fn vld2_dup_s8_(a: *const i8) -> int8x8x2_t; + fn vld2_dup_s8_(ptr: *const i8, size: i32) -> int8x8x2_t; } -vld2_dup_s8_(a) +vld2_dup_s8_(a as *const i8, 1) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i8.p0i8")] + fn vld2_dup_s8_(ptr: *const i8) -> int8x8x2_t; + } +vld2_dup_s8_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i16.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i16.p0i16")] - fn vld2_dup_s16_(a: *const i16) -> int16x4x2_t; + fn vld2_dup_s16_(ptr: *const i8, size: i32) -> int16x4x2_t; } -vld2_dup_s16_(a) +vld2_dup_s16_(a as *const i8, 2) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i16.p0i16")] + fn vld2_dup_s16_(ptr: *const i16) -> int16x4x2_t; + } +vld2_dup_s16_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] 
+#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2i32.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i32.p0i32")] - fn vld2_dup_s32_(a: *const i32) -> int32x2x2_t; + fn vld2_dup_s32_(ptr: *const i8, size: i32) -> int32x2x2_t; } -vld2_dup_s32_(a) +vld2_dup_s32_(a as *const i8, 4) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2i32.p0i32")] + fn vld2_dup_s32_(ptr: *const i32) -> int32x2x2_t; + } +vld2_dup_s32_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v1i64.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1i64.p0i64")] - fn vld2_dup_s64_(a: *const i64) -> int64x1x2_t; + fn vld2_dup_s64_(ptr: *const i8, size: i32) -> int64x1x2_t; } -vld2_dup_s64_(a) +vld2_dup_s64_(a as *const i8, 8) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1i64.p0i64")] + fn vld2_dup_s64_(ptr: *const i64) -> int64x1x2_t; + } +vld2_dup_s64_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v16i8.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v16i8.p0i8")] - fn vld2q_dup_s8_(a: *const i8) -> int8x16x2_t; + fn vld2q_dup_s8_(ptr: *const i8, size: i32) -> int8x16x2_t; } -vld2q_dup_s8_(a) +vld2q_dup_s8_(a as *const i8, 1) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub 
unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v16i8.p0i8")] + fn vld2q_dup_s8_(ptr: *const i8) -> int8x16x2_t; + } +vld2q_dup_s8_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v8i16.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i16.p0i16")] - fn vld2q_dup_s16_(a: *const i16) -> int16x8x2_t; + fn vld2q_dup_s16_(ptr: *const i8, size: i32) -> int16x8x2_t; } -vld2q_dup_s16_(a) +vld2q_dup_s16_(a as *const i8, 2) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v8i16.p0i16")] + fn vld2q_dup_s16_(ptr: *const i16) -> int16x8x2_t; + } +vld2q_dup_s16_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4i32.p0i8")] + fn vld2q_dup_s32_(ptr: *const i8, size: i32) -> int32x4x2_t; + } +vld2q_dup_s32_(a as *const i8, 4) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4i32.p0i32")] - fn vld2q_dup_s32_(a: *const i32) -> int32x4x2_t; + fn vld2q_dup_s32_(ptr: *const i32) -> int32x4x2_t; } -vld2q_dup_s32_(a) +vld2q_dup_s32_(a.cast()) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -7364,34 +7448,58 @@ pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t { /// Load single 2-element structure and replicate to all lanes of two registers #[inline] -#[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v2f32.p0i8")] - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f32.p0f32")] - 
fn vld2_dup_f32_(a: *const f32) -> float32x2x2_t; + fn vld2_dup_f32_(ptr: *const i8, size: i32) -> float32x2x2_t; } -vld2_dup_f32_(a) +vld2_dup_f32_(a as *const i8, 4) } /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v2f32.p0f32")] + fn vld2_dup_f32_(ptr: *const f32) -> float32x2x2_t; + } +vld2_dup_f32_(a.cast()) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t { #[allow(improper_ctypes)] extern "unadjusted" { #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v4f32.p0i8")] + fn vld2q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x2_t; + } +vld2q_dup_f32_(a as *const i8, 4) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v4f32.p0f32")] - fn vld2q_dup_f32_(a: *const f32) -> float32x4x2_t; + fn vld2q_dup_f32_(ptr: *const f32) -> float32x4x2_t; } -vld2q_dup_f32_(a) +vld2q_dup_f32_(a.cast()) } /// Load multiple 3-element structures to three registers diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 1da8a83345..2ac63f2333 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2184,6 +2184,7 @@ out-dup-nox a = 0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn +arm-aarch64-separate aarch64 = ld2r link-aarch64 = ld2r._EXT2_ @@ -2221,6 +2222,7 @@ out-dup-nox a = 0., 1., 1., 2., 3., 1., 4., 3., 5. validate 1., 1., 1., 1., 1., 1., 1., 1. 
load_fn +arm-aarch64-separate aarch64 = ld2r link-aarch64 = ld2r._EXT2_ diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs index 0f88daf111..1c518d5b82 100644 --- a/crates/stdarch-gen/src/main.rs +++ b/crates/stdarch-gen/src/main.rs @@ -918,10 +918,9 @@ fn ext(s: &str, in_t: &[&str; 3], out_t: &str) -> String { fn is_vldx(name: &str) -> bool { let s: Vec<_> = name.split('_').collect(); - s.len() == 2 - && &name[0..3] == "vld" + &name[0..3] == "vld" && name[3..4].parse::<i32>().unwrap() > 1 - && (s[1].starts_with("s") || s[1].starts_with("f")) + && (s.last().unwrap().starts_with("s") || s.last().unwrap().starts_with("f")) } fn is_vstx(name: &str) -> bool { @@ -1114,8 +1113,13 @@ fn gen_aarch64( }; (format!("{}, ptr: *mut {}", subs, ptr_type), String::new()) } else if is_vldx(&name) { + let ptr_type = if name.contains("dup") { + type_to_native_type(out_t) + } else { + type_to_sub_type(out_t) + }; ( - format!("ptr: *const {}", type_to_sub_type(out_t)), + format!("ptr: *const {}", ptr_type), format!(" -> {}", out_t), ) } else { @@ -1978,8 +1982,13 @@ fn gen_arm( inputs.push_str(&format!(", ptr: *mut {}", ptr_type)); (inputs, String::new()) } else if is_vldx(&name) { + let ptr_type = if name.contains("dup") { + type_to_native_type(out_t) + } else { + type_to_sub_type(out_t) + }; ( - format!("ptr: *const {}", type_to_sub_type(out_t)), + format!("ptr: *const {}", ptr_type), format!(" -> {}", out_t), ) } else { From bfd74055901441049c618a43c475991f7b65a88d Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 14:51:37 +0800 Subject: [PATCH 08/28] correct assert_instr --- .../src/arm_shared/neon/generated.rs | 42 +++++++++---------- crates/stdarch-gen/neon.spec | 6 +-- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 8ecd44f840..63885189fd 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7134,7 +7134,7 @@ vld2q_f32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2_dup_s8(a: *const i8) -> int8x8x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7162,7 +7162,7 @@ vld2_dup_s8_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2_dup_s16(a: *const i16) -> int16x4x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7190,7 +7190,7 @@ vld2_dup_s16_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7218,7 +7218,7 @@ vld2_dup_s32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7246,7 +7246,7 @@ vld2_dup_s64_(a.cast()) #[inline] #[cfg(target_arch = 
"arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2q_dup_s8(a: *const i8) -> int8x16x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7274,7 +7274,7 @@ vld2q_dup_s8_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2q_dup_s16(a: *const i16) -> int16x8x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7302,7 +7302,7 @@ vld2q_dup_s16_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2q_dup_s32(a: *const i32) -> int32x4x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7330,7 +7330,7 @@ vld2q_dup_s32_(a.cast()) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { transmute(vld2_dup_s8(transmute(a))) @@ -7340,7 +7340,7 @@ pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { transmute(vld2_dup_s16(transmute(a))) @@ -7350,7 +7350,7 @@ pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { transmute(vld2_dup_s32(transmute(a))) @@ -7360,7 +7360,7 @@ pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t { transmute(vld2_dup_s64(transmute(a))) @@ -7370,7 +7370,7 @@ pub unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t { transmute(vld2q_dup_s8(transmute(a))) @@ -7380,7 +7380,7 @@ pub unsafe fn vld2q_dup_u8(a: *const u8) -> uint8x16x2_t { #[inline] #[target_feature(enable = "neon")] 
#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t { transmute(vld2q_dup_s16(transmute(a))) @@ -7390,7 +7390,7 @@ pub unsafe fn vld2q_dup_u16(a: *const u16) -> uint16x8x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t { transmute(vld2q_dup_s32(transmute(a))) @@ -7400,7 +7400,7 @@ pub unsafe fn vld2q_dup_u32(a: *const u32) -> uint32x4x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t { transmute(vld2_dup_s8(transmute(a))) @@ -7410,7 +7410,7 @@ pub unsafe fn vld2_dup_p8(a: *const p8) -> poly8x8x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_p16(a: *const p16) -> poly16x4x2_t { transmute(vld2_dup_s16(transmute(a))) @@ -7420,7 +7420,7 @@ pub unsafe fn vld2_dup_p16(a: *const p16) -> poly16x4x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t { transmute(vld2q_dup_s8(transmute(a))) @@ -7430,7 +7430,7 @@ pub unsafe fn vld2q_dup_p8(a: *const p8) -> poly8x16x2_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t { transmute(vld2q_dup_s16(transmute(a))) @@ -7440,7 +7440,7 @@ pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t { #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t { transmute(vld2_dup_s64(transmute(a))) @@ -7450,7 +7450,7 @@ pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t { #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2_dup_f32(a: *const f32) -> float32x2x2_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -7478,7 +7478,7 @@ vld2_dup_f32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t { #[allow(improper_ctypes)] extern "unadjusted" { diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 2ac63f2333..10dabc0741 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2190,7 +2190,7 @@ aarch64 = ld2r link-aarch64 = ld2r._EXT2_ generate *const i64:int64x2x2_t -arm = vld2dup +arm = vld2 link-arm = vld2dup._EXTpi82_ generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t @@ -2209,7 +2209,7 @@ target = aes generate *const p64:poly64x2x2_t target = default -arm = vld2dup +arm = vld2 generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t @@ -2228,7 +2228,7 @@ aarch64 = ld2r link-aarch64 = ld2r._EXT2_ generate *const f64:float64x1x2_t, *const f64:float64x2x2_t -arm = vld2dup +arm = vld2 link-arm = vld2dup._EXTpi82_ generate *const f32:float32x2x2_t, *const f32:float32x4x2_t From ce67eec4dd577d219345400b978ec31c5b490182 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 14:56:09 +0800 Subject: [PATCH 09/28] correct assert_instr --- .../src/arm_shared/neon/generated.rs | 110 +++++++++--------- crates/stdarch-gen/neon.spec | 8 +- 2 files changed, 61 insertions(+), 57 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 63885189fd..fcdc140272 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7214,34 +7214,6 @@ pub unsafe fn vld2_dup_s32(a: *const i32) -> int32x2x2_t { vld2_dup_s32_(a.cast()) } -/// Load single 2-element structure and replicate to all lanes of two registers -#[inline] -#[cfg(target_arch = "arm")] -#[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] -pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v1i64.p0i8")] - fn vld2_dup_s64_(ptr: *const i8, size: i32) -> int64x1x2_t; - } -vld2_dup_s64_(a as *const i8, 8) -} - -/// Load single 2-element structure and replicate to all lanes of two registers -#[inline] -#[cfg(target_arch = "aarch64")] -#[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] -pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { - #[allow(improper_ctypes)] - extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1i64.p0i64")] - fn vld2_dup_s64_(ptr: *const i64) -> int64x1x2_t; - } -vld2_dup_s64_(a.cast()) -} - /// Load single 2-element structure and replicate to all lanes of two registers #[inline] #[cfg(target_arch = "arm")] @@ 
-7328,12 +7300,30 @@ vld2q_dup_s32_(a.cast()) /// Load single 2-element structure and replicate to all lanes of two registers #[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2dup.v1i64.p0i8")] + fn vld2_dup_s64_(ptr: *const i8, size: i32) -> int64x1x2_t; + } +vld2_dup_s64_(a as *const i8, 8) +} + +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] -pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { - transmute(vld2_dup_s8(transmute(a))) +pub unsafe fn vld2_dup_s64(a: *const i64) -> int64x1x2_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2r.v1i64.p0i64")] + fn vld2_dup_s64_(ptr: *const i64) -> int64x1x2_t; + } +vld2_dup_s64_(a.cast()) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -7342,8 +7332,8 @@ pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] -pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { - transmute(vld2_dup_s16(transmute(a))) +pub unsafe fn vld2_dup_u8(a: *const u8) -> uint8x8x2_t { + transmute(vld2_dup_s8(transmute(a))) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -7352,8 +7342,8 @@ pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] -pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { - transmute(vld2_dup_s32(transmute(a))) +pub unsafe fn vld2_dup_u16(a: *const u16) -> uint16x4x2_t { + transmute(vld2_dup_s16(transmute(a))) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -7362,8 +7352,8 @@ pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] -pub unsafe fn vld2_dup_u64(a: *const u64) -> uint64x1x2_t { - transmute(vld2_dup_s64(transmute(a))) +pub unsafe fn vld2_dup_u32(a: *const u32) -> uint32x2x2_t { + transmute(vld2_dup_s32(transmute(a))) } /// Load single 2-element structure and replicate to all lanes of two registers @@ -7436,11 +7426,21 @@ pub unsafe fn vld2q_dup_p16(a: *const p16) -> poly16x8x2_t { transmute(vld2q_dup_s16(transmute(a))) } +/// Load single 2-element structure and replicate to all lanes of two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] +pub unsafe fn vld2_dup_u64(a: 
*const u64) -> uint64x1x2_t { + transmute(vld2_dup_s64(transmute(a))) +} + /// Load single 2-element structure and replicate to all lanes of two registers #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2r))] pub unsafe fn vld2_dup_p64(a: *const p64) -> poly64x1x2_t { transmute(vld2_dup_s64(transmute(a))) @@ -23377,14 +23377,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld2_dup_s64() { - let a: [i64; 3] = [0, 1, 1]; - let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(1)]; - let r: [i64x1; 2] = transmute(vld2_dup_s64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld2q_dup_s8() { let a: [i8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; @@ -23409,6 +23401,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_s64() { + let a: [i64; 3] = [0, 1, 1]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 2] = transmute(vld2_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld2_dup_u8() { let a: [u8; 17] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9]; @@ -23433,14 +23433,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vld2_dup_u64() { - let a: [u64; 3] = [0, 1, 1]; - let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(1)]; - let r: [u64x1; 2] = transmute(vld2_dup_u64(a[1..].as_ptr())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vld2q_dup_u8() { let a: [u8; 33] = [0, 1, 1, 2, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17]; @@ -23497,6 +23489,14 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld2_dup_u64() { + let a: [u64; 3] = [0, 1, 1]; + let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(1)]; + let r: [u64x1; 2] = transmute(vld2_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld2_dup_p64() { let a: [u64; 3] = [0, 1, 1]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 10dabc0741..624f4e378a 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2192,8 +2192,10 @@ generate *const i64:int64x2x2_t arm = vld2 link-arm = vld2dup._EXTpi82_ -generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t, *const i64:int64x1x2_t +generate *const i8:int8x8x2_t, *const i16:int16x4x2_t, *const i32:int32x2x2_t generate *const i8:int8x16x2_t, *const i16:int16x8x2_t, *const i32:int32x4x2_t +arm = nop +generate *const i64:int64x1x2_t /// Load single 2-element structure and replicate to all lanes of two registers name = vld2 @@ -2210,9 +2212,11 @@ generate *const p64:poly64x2x2_t target = default arm = vld2 -generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t, *const u64:uint64x1x2_t +generate *const u8:uint8x8x2_t, *const u16:uint16x4x2_t, *const u32:uint32x2x2_t generate *const u8:uint8x16x2_t, *const u16:uint16x8x2_t, *const u32:uint32x4x2_t generate *const p8:poly8x8x2_t, *const p16:poly16x4x2_t, *const p8:poly8x16x2_t, *const p16:poly16x8x2_t +arm = nop +generate *const u64:uint64x1x2_t target = aes generate 
*const p64:poly64x1x2_t From f0d41aaf28042977770243613ce8d6d9bbc34724 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 15:13:10 +0800 Subject: [PATCH 10/28] add vld3_dup and vld4_dup neon instrs --- .../core_arch/src/aarch64/neon/generated.rs | 190 +++ .../src/arm_shared/neon/generated.rs | 1256 +++++++++++++++-- crates/stdarch-gen/neon.spec | 62 +- 3 files changed, 1395 insertions(+), 113 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 10373c38d8..77ecc3ac55 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4757,6 +4757,61 @@ pub unsafe fn vld3q_f64(a: *const f64) -> float64x2x3_t { vld3q_f64_(a.cast()) } +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s64(a: *const i64) -> int64x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2i64.p0i64")] + fn vld3q_dup_s64_(ptr: *const i64) -> int64x2x3_t; + } + vld3q_dup_s64_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_u64(a: *const u64) -> uint64x2x3_t { + transmute(vld3q_dup_s64(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_p64(a: *const p64) -> poly64x2x3_t { + transmute(vld3q_dup_s64(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3_dup_f64(a: *const f64) -> float64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v1f64.p0f64")] + fn vld3_dup_f64_(ptr: *const f64) -> float64x1x3_t; + } + vld3_dup_f64_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3r))] +pub unsafe fn vld3q_dup_f64(a: *const f64) -> float64x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2f64.p0f64")] + fn vld3q_dup_f64_(ptr: *const f64) -> float64x2x3_t; + } + vld3q_dup_f64_(a.cast()) +} + /// Load multiple 4-element structures to four registers #[inline] #[target_feature(enable = "neon")] @@ -4812,6 +4867,61 @@ pub unsafe fn vld4q_f64(a: *const f64) -> float64x2x4_t { vld4q_f64_(a.cast()) } +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +pub unsafe fn vld4q_dup_s64(a: *const i64) -> int64x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2i64.p0i64")] + fn vld4q_dup_s64_(ptr: *const i64) -> int64x2x4_t; + } + vld4q_dup_s64_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] 
+pub unsafe fn vld4q_dup_u64(a: *const u64) -> uint64x2x4_t { + transmute(vld4q_dup_s64(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4r))] +pub unsafe fn vld4q_dup_p64(a: *const p64) -> poly64x2x4_t { + transmute(vld4q_dup_s64(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +pub unsafe fn vld4_dup_f64(a: *const f64) -> float64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v1f64.p0f64")] + fn vld4_dup_f64_(ptr: *const f64) -> float64x1x4_t; + } + vld4_dup_f64_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4r))] +pub unsafe fn vld4q_dup_f64(a: *const f64) -> float64x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f64.p0f64")] + fn vld4q_dup_f64_(ptr: *const f64) -> float64x2x4_t; + } + vld4q_dup_f64_(a.cast()) +} + /// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] @@ -13401,6 +13511,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_s64() { + let a: [i64; 7] = [0, 1, 1, 1, 3, 1, 4]; + let e: [i64x2; 3] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)]; + let r: [i64x2; 3] = transmute(vld3q_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_u64() { + let a: [u64; 7] = [0, 1, 1, 1, 3, 1, 4]; + let e: [u64x2; 3] = [u64x2::new(1, 1), u64x2::new(1, 1), u64x2::new(1, 1)]; + let r: [u64x2; 3] = transmute(vld3q_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_p64() { + let a: [u64; 7] = [0, 1, 1, 1, 3, 1, 4]; + let e: [i64x2; 3] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)]; + let r: [i64x2; 3] = transmute(vld3q_dup_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_f64() { + let a: [f64; 4] = [0., 1., 1., 1.]; + let e: [f64; 3] = [1., 1., 1.]; + let r: [f64; 3] = transmute(vld3_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_f64() { + let a: [f64; 7] = [0., 1., 1., 1., 3., 1., 4.]; + let e: [f64x2; 3] = [f64x2::new(1., 1.), f64x2::new(1., 1.), f64x2::new(1., 1.)]; + let r: [f64x2; 3] = transmute(vld3q_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld4q_s64() { let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; @@ -13441,6 +13591,46 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_s64() { + let a: [i64; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5]; + let e: [i64x2; 4] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)]; + let r: [i64x2; 4] = transmute(vld4q_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_u64() { + let a: [u64; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5]; + let e: [u64x2; 4] = [u64x2::new(1, 1), u64x2::new(1, 1), u64x2::new(1, 1), u64x2::new(1, 1)]; + let r: [u64x2; 4] = 
transmute(vld4q_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_p64() { + let a: [u64; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5]; + let e: [i64x2; 4] = [i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1), i64x2::new(1, 1)]; + let r: [i64x2; 4] = transmute(vld4q_dup_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_f64() { + let a: [f64; 5] = [0., 1., 1., 1., 1.]; + let e: [f64; 4] = [1., 1., 1., 1.]; + let r: [f64; 4] = transmute(vld4_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_f64() { + let a: [f64; 9] = [0., 1., 1., 1., 1., 6., 4., 3., 5.]; + let e: [f64x2; 4] = [f64x2::new(1., 1.), f64x2::new(1., 1.), f64x2::new(1., 1.), f64x2::new(1., 1.)]; + let r: [f64x2; 4] = transmute(vld4q_dup_f64(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_f64_x2() { let a: [f64; 3] = [0., 1., 2.]; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index fcdc140272..e31d4eed3d 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7874,6 +7874,378 @@ pub unsafe fn vld3q_f32(a: *const f32) -> float32x4x3_t { vld3q_f32_(a.cast()) } +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8i8.p0i8")] + fn vld3_dup_s8_(ptr: *const i8, size: i32) -> int8x8x3_t; + } +vld3_dup_s8_(a as *const i8, 1) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_s8(a: *const i8) -> int8x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v8i8.p0i8")] + fn vld3_dup_s8_(ptr: *const i8) -> int8x8x3_t; + } +vld3_dup_s8_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4i16.p0i8")] + fn vld3_dup_s16_(ptr: *const i8, size: i32) -> int16x4x3_t; + } +vld3_dup_s16_(a as *const i8, 2) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_s16(a: *const i16) -> int16x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v4i16.p0i16")] + fn vld3_dup_s16_(ptr: *const i16) -> int16x4x3_t; + } +vld3_dup_s16_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers 
+#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v2i32.p0i8")] + fn vld3_dup_s32_(ptr: *const i8, size: i32) -> int32x2x3_t; + } +vld3_dup_s32_(a as *const i8, 4) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_s32(a: *const i32) -> int32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2i32.p0i32")] + fn vld3_dup_s32_(ptr: *const i32) -> int32x2x3_t; + } +vld3_dup_s32_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v16i8.p0i8")] + fn vld3q_dup_s8_(ptr: *const i8, size: i32) -> int8x16x3_t; + } +vld3q_dup_s8_(a as *const i8, 1) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s8(a: *const i8) -> int8x16x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v16i8.p0i8")] + fn vld3q_dup_s8_(ptr: *const i8) -> int8x16x3_t; + } +vld3q_dup_s8_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_dup_s16(a: *const i16) -> int16x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v8i16.p0i8")] + fn vld3q_dup_s16_(ptr: *const i8, size: i32) -> int16x8x3_t; + } +vld3q_dup_s16_(a as *const i8, 2) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s16(a: *const i16) -> int16x8x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v8i16.p0i16")] + fn vld3q_dup_s16_(ptr: *const i16) -> int16x8x3_t; + } +vld3q_dup_s16_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4i32.p0i8")] + fn vld3q_dup_s32_(ptr: *const i8, size: i32) -> int32x4x3_t; + } 
+vld3q_dup_s32_(a as *const i8, 4) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_s32(a: *const i32) -> int32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v4i32.p0i32")] + fn vld3q_dup_s32_(ptr: *const i32) -> int32x4x3_t; + } +vld3q_dup_s32_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v1i64.p0i8")] + fn vld3_dup_s64_(ptr: *const i8, size: i32) -> int64x1x3_t; + } +vld3_dup_s64_(a as *const i8, 8) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_s64(a: *const i64) -> int64x1x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v1i64.p0i64")] + fn vld3_dup_s64_(ptr: *const i64) -> int64x1x3_t; + } +vld3_dup_s64_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_u8(a: *const u8) -> uint8x8x3_t { + transmute(vld3_dup_s8(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_u16(a: *const u16) -> uint16x4x3_t { + transmute(vld3_dup_s16(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_u32(a: *const u32) -> uint32x2x3_t { + transmute(vld3_dup_s32(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_u8(a: *const u8) -> uint8x16x3_t { + transmute(vld3q_dup_s8(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] 
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_u16(a: *const u16) -> uint16x8x3_t { + transmute(vld3q_dup_s16(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_u32(a: *const u32) -> uint32x4x3_t { + transmute(vld3q_dup_s32(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_p8(a: *const p8) -> poly8x8x3_t { + transmute(vld3_dup_s8(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_p16(a: *const p16) -> poly16x4x3_t { + transmute(vld3_dup_s16(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_p8(a: *const p8) -> poly8x16x3_t { + transmute(vld3q_dup_s8(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_p16(a: *const p16) -> poly16x8x3_t { + transmute(vld3q_dup_s16(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_u64(a: *const u64) -> uint64x1x3_t { + transmute(vld3_dup_s64(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_p64(a: *const p64) -> poly64x1x3_t { + transmute(vld3_dup_s64(transmute(a))) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3_dup_f32(a: 
*const f32) -> float32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v2f32.p0i8")] + fn vld3_dup_f32_(ptr: *const i8, size: i32) -> float32x2x3_t; + } +vld3_dup_f32_(a as *const i8, 4) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3_dup_f32(a: *const f32) -> float32x2x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v2f32.p0f32")] + fn vld3_dup_f32_(ptr: *const f32) -> float32x2x3_t; + } +vld3_dup_f32_(a.cast()) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3))] +pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3dup.v4f32.p0i8")] + fn vld3q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x3_t; + } +vld3q_dup_f32_(a as *const i8, 4) +} + +/// Load single 3-element structure and replicate to all lanes of three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3r))] +pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3r.v4f32.p0f32")] + fn vld3q_dup_f32_(ptr: *const f32) -> float32x4x3_t; + } +vld3q_dup_f32_(a.cast()) +} + /// Load multiple 4-element structures to four registers #[inline] #[cfg(target_arch = "arm")] @@ -8056,194 +8428,566 @@ pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { vld4_s64_(a as *const i8, 8) } -/// Load multiple 4-element structures to four registers +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1i64.p0v1i64")] + fn vld4_s64_(ptr: *const int64x1_t) -> int64x1x4_t; + } +vld4_s64_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t { + transmute(vld4_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t { + transmute(vld4_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", 
target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t { + transmute(vld4_s32(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t { + transmute(vld4q_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t { + transmute(vld4q_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t { + transmute(vld4q_s32(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t { + transmute(vld4_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t { + transmute(vld4_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t { + transmute(vld4q_s8(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t { + transmute(vld4q_s16(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_s64(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] 
+#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { + transmute(vld4_s64(transmute(a))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2f32.p0i8")] + fn vld4_f32_(ptr: *const i8, size: i32) -> float32x2x4_t; + } +vld4_f32_(a as *const i8, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f32.p0v2f32")] + fn vld4_f32_(ptr: *const float32x2_t) -> float32x2x4_t; + } +vld4_f32_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4f32.p0i8")] + fn vld4q_f32_(ptr: *const i8, size: i32) -> float32x4x4_t; + } +vld4q_f32_(a as *const i8, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] +pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4f32.p0v4f32")] + fn vld4q_f32_(ptr: *const float32x4_t) -> float32x4x4_t; + } +vld4q_f32_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8i8.p0i8")] + fn vld4_dup_s8_(ptr: *const i8, size: i32) -> int8x8x4_t; + } +vld4_dup_s8_(a as *const i8, 1) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v8i8.p0i8")] + fn vld4_dup_s8_(ptr: *const i8) -> int8x8x4_t; + } +vld4_dup_s8_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), 
assert_instr(vld4dup))] +pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4i16.p0i8")] + fn vld4_dup_s16_(ptr: *const i8, size: i32) -> int16x4x4_t; + } +vld4_dup_s16_(a as *const i8, 2) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4i16.p0i16")] + fn vld4_dup_s16_(ptr: *const i16) -> int16x4x4_t; + } +vld4_dup_s16_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2i32.p0i8")] + fn vld4_dup_s32_(ptr: *const i8, size: i32) -> int32x2x4_t; + } +vld4_dup_s32_(a as *const i8, 4) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2i32.p0i32")] + fn vld4_dup_s32_(ptr: *const i32) -> int32x2x4_t; + } +vld4_dup_s32_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v16i8.p0i8")] + fn vld4q_dup_s8_(ptr: *const i8, size: i32) -> int8x16x4_t; + } +vld4q_dup_s8_(a as *const i8, 1) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v16i8.p0i8")] + fn vld4q_dup_s8_(ptr: *const i8) -> int8x16x4_t; + } +vld4q_dup_s8_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v8i16.p0i8")] + fn vld4q_dup_s16_(ptr: *const i8, size: i32) -> int16x8x4_t; + } +vld4q_dup_s16_(a as *const i8, 2) +} + +/// Load single 4-element structure and replicate to all lanes of four registers 
+#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v8i16.p0i16")] + fn vld4q_dup_s16_(ptr: *const i16) -> int16x8x4_t; + } +vld4q_dup_s16_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4i32.p0i8")] + fn vld4q_dup_s32_(ptr: *const i8, size: i32) -> int32x4x4_t; + } +vld4q_dup_s32_(a as *const i8, 4) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4i32.p0i32")] + fn vld4q_dup_s32_(ptr: *const i32) -> int32x4x4_t; + } +vld4q_dup_s32_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v1i64.p0i8")] + fn vld4_dup_s64_(ptr: *const i8, size: i32) -> int64x1x4_t; + } +vld4_dup_s64_(a as *const i8, 8) +} + +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] -pub unsafe fn vld4_s64(a: *const i64) -> int64x1x4_t { +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_s64(a: *const i64) -> int64x1x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v1i64.p0v1i64")] - fn vld4_s64_(ptr: *const int64x1_t) -> int64x1x4_t; + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v1i64.p0i64")] + fn vld4_dup_s64_(ptr: *const i64) -> int64x1x4_t; } -vld4_s64_(a.cast()) +vld4_dup_s64_(a.cast()) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_u8(a: *const u8) -> uint8x8x4_t { - transmute(vld4_s8(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t { + transmute(vld4_dup_s8(transmute(a))) } -/// Load multiple 4-element 
structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_u16(a: *const u16) -> uint16x4x4_t { - transmute(vld4_s16(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t { + transmute(vld4_dup_s16(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_u32(a: *const u32) -> uint32x2x4_t { - transmute(vld4_s32(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t { + transmute(vld4_dup_s32(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4q_u8(a: *const u8) -> uint8x16x4_t { - transmute(vld4q_s8(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t { + transmute(vld4q_dup_s8(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4q_u16(a: *const u16) -> uint16x8x4_t { - transmute(vld4q_s16(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t { + transmute(vld4q_dup_s16(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4q_u32(a: *const u32) -> uint32x4x4_t { - transmute(vld4q_s32(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t { + transmute(vld4q_dup_s32(transmute(a))) } 
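// The unsigned and polynomial `_dup` wrappers above are plain transmutes of
// the signed loads; only the signed and float entry points link directly to
// an LLVM intrinsic. Semantically, a 4-element dup load reads one 4-element
// structure from memory and broadcasts element i to every lane of result
// register i. A scalar sketch of that behavior, for orientation only (the
// name `vld4_dup_model_u16` is hypothetical and not part of this patch; the
// real intrinsics lower to a single LD4R on AArch64, or to VLD4 with
// all-lanes addressing on ARM):
fn vld4_dup_model_u16(a: &[u16]) -> [[u16; 4]; 4] {
    // Read one 4-element structure from a[0..4], then replicate
    // element i into every lane of result "register" i.
    [[a[0]; 4], [a[1]; 4], [a[2]; 4], [a[3]; 4]]
}
// This is why the dup test vectors further down open with a run of equal
// values: a structure of (1, 1, 1, 1) yields four registers of all ones.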
-/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_p8(a: *const p8) -> poly8x8x4_t { - transmute(vld4_s8(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t { + transmute(vld4_dup_s8(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_p16(a: *const p16) -> poly16x4x4_t { - transmute(vld4_s16(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t { + transmute(vld4_dup_s16(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4q_p8(a: *const p8) -> poly8x16x4_t { - transmute(vld4q_s8(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t { + transmute(vld4q_dup_s8(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4q_p16(a: *const p16) -> poly16x8x4_t { - transmute(vld4q_s16(transmute(a))) +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t { + transmute(vld4q_dup_s16(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] -pub unsafe fn vld4_u64(a: *const u64) -> uint64x1x4_t { - transmute(vld4_s64(transmute(a))) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_dup_s64(transmute(a))) } -/// Load multiple 4-element structures to four 
registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] -pub unsafe fn vld4_p64(a: *const p64) -> poly64x1x4_t { - transmute(vld4_s64(transmute(a))) +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_p64(a: *const p64) -> poly64x1x4_t { + transmute(vld4_dup_s64(transmute(a))) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v2f32.p0i8")] - fn vld4_f32_(ptr: *const i8, size: i32) -> float32x2x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2f32.p0i8")] + fn vld4_dup_f32_(ptr: *const i8, size: i32) -> float32x2x4_t; } -vld4_f32_(a as *const i8, 4) +vld4_dup_f32_(a as *const i8, 4) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4_f32(a: *const f32) -> float32x2x4_t { +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v2f32.p0v2f32")] - fn vld4_f32_(ptr: *const float32x2_t) -> float32x2x4_t; + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f32.p0f32")] + fn vld4_dup_f32_(ptr: *const f32) -> float32x2x4_t; } -vld4_f32_(a.cast()) +vld4_dup_f32_(a.cast()) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] -pub unsafe fn vld4q_f32(a: *const f32) -> float32x4x4_t { +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4.v4f32.p0i8")] - fn vld4q_f32_(ptr: *const i8, size: i32) -> float32x4x4_t; + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4f32.p0i8")] + fn vld4q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x4_t; } -vld4q_f32_(a as *const i8, 4) +vld4q_dup_f32_(a as *const i8, 4) } -/// Load multiple 4-element structures to four registers +/// Load single 4-element structure and replicate to all lanes of four registers #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4))] -pub unsafe fn vld4q_f32(a: *const f32) -> 
float32x4x4_t { +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { #[allow(improper_ctypes)] extern "unadjusted" { - #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4.v4f32.p0v4f32")] - fn vld4q_f32_(ptr: *const float32x4_t) -> float32x4x4_t; + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4f32.p0f32")] + fn vld4q_dup_f32_(ptr: *const f32) -> float32x4x4_t; } -vld4q_f32_(a.cast()) +vld4q_dup_f32_(a.cast()) } /// Store multiple single-element structures from one, two, three, or four registers @@ -23689,6 +24433,174 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_s8() { + let a: [i8; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13]; + let e: [i8x8; 3] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x8; 3] = transmute(vld3_dup_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_s16() { + let a: [i16; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7]; + let e: [i16x4; 3] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)]; + let r: [i16x4; 3] = transmute(vld3_dup_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_s32() { + let a: [i32; 7] = [0, 1, 1, 1, 3, 1, 4]; + let e: [i32x2; 3] = [i32x2::new(1, 1), i32x2::new(1, 1), i32x2::new(1, 1)]; + let r: [i32x2; 3] = transmute(vld3_dup_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_s8() { + let a: [i8; 49] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [i8x16; 3] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x16; 3] = transmute(vld3q_dup_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_s16() { + let a: [i16; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13]; + let e: [i16x8; 3] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i16x8; 3] = transmute(vld3q_dup_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_s32() { + let a: [i32; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7]; + let e: [i32x4; 3] = [i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1)]; + let r: [i32x4; 3] = transmute(vld3q_dup_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_s64() { + let a: [i64; 4] = [0, 1, 1, 1]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 3] = transmute(vld3_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_u8() { + let a: [u8; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13]; + let e: [u8x8; 3] = [u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u8x8; 3] = transmute(vld3_dup_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_u16() { + let a: [u16; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7]; + let e: [u16x4; 3] = [u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1)]; + let r: [u16x4; 3] = transmute(vld3_dup_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_u32() { + let a: [u32; 7] = [0, 1, 1, 1, 3, 1, 4]; + let e: [u32x2; 3] = [u32x2::new(1, 1), u32x2::new(1, 1), u32x2::new(1, 1)]; + let r: [u32x2; 3] = transmute(vld3_dup_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_u8() { + let a: [u8; 49] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [u8x16; 3] = [u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u8x16; 3] = transmute(vld3q_dup_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_u16() { + let a: [u16; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13]; + let e: [u16x8; 3] = [u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u16x8; 3] = transmute(vld3q_dup_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_u32() { + let a: [u32; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7]; + let e: [u32x4; 3] = [u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1)]; + let r: [u32x4; 3] = transmute(vld3q_dup_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_p8() { + let a: [u8; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13]; + let e: [i8x8; 3] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x8; 3] = transmute(vld3_dup_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_p16() { + let a: [u16; 13] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7]; + let e: [i16x4; 3] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)]; + let r: [i16x4; 3] = transmute(vld3_dup_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_p8() { + let a: [u8; 49] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17]; + let e: [i8x16; 3] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x16; 3] = transmute(vld3q_dup_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_p16() { + let a: [u16; 25] = [0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13]; + let e: [i16x8; 3] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i16x8; 3] = transmute(vld3q_dup_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_u64() { + let a: 
[u64; 4] = [0, 1, 1, 1]; + let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(1), u64x1::new(1)]; + let r: [u64x1; 3] = transmute(vld3_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_p64() { + let a: [u64; 4] = [0, 1, 1, 1]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 3] = transmute(vld3_dup_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_dup_f32() { + let a: [f32; 7] = [0., 1., 1., 1., 3., 1., 4.]; + let e: [f32x2; 3] = [f32x2::new(1., 1.), f32x2::new(1., 1.), f32x2::new(1., 1.)]; + let r: [f32x2; 3] = transmute(vld3_dup_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_dup_f32() { + let a: [f32; 13] = [0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5.]; + let e: [f32x4; 3] = [f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.)]; + let r: [f32x4; 3] = transmute(vld3q_dup_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld4_s8() { let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; @@ -23857,6 +24769,174 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_s8() { + let a: [i8; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x8; 4] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x8; 4] = transmute(vld4_dup_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_s16() { + let a: [i16; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x4; 4] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)]; + let r: [i16x4; 4] = transmute(vld4_dup_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_s32() { + let a: [i32; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5]; + let e: [i32x2; 4] = [i32x2::new(1, 1), i32x2::new(1, 1), i32x2::new(1, 1), i32x2::new(1, 1)]; + let r: [i32x2; 4] = transmute(vld4_dup_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_s8() { + let a: [i8; 65] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x16; 4] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x16; 4] = transmute(vld4q_dup_s8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_s16() { + let a: [i16; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x8; 4] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i16x8; 4] = transmute(vld4q_dup_s16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_s32() { + 
let a: [i32; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i32x4; 4] = [i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1), i32x4::new(1, 1, 1, 1)]; + let r: [i32x4; 4] = transmute(vld4q_dup_s32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_s64() { + let a: [i64; 5] = [0, 1, 1, 1, 1]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(1), i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 4] = transmute(vld4_dup_s64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_u8() { + let a: [u8; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [u8x8; 4] = [u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1), u8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u8x8; 4] = transmute(vld4_dup_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_u16() { + let a: [u16; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [u16x4; 4] = [u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1), u16x4::new(1, 1, 1, 1)]; + let r: [u16x4; 4] = transmute(vld4_dup_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_u32() { + let a: [u32; 9] = [0, 1, 1, 1, 1, 2, 4, 3, 5]; + let e: [u32x2; 4] = [u32x2::new(1, 1), u32x2::new(1, 1), u32x2::new(1, 1), u32x2::new(1, 1)]; + let r: [u32x2; 4] = transmute(vld4_dup_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_u8() { + let a: [u8; 65] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [u8x16; 4] = [u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), u8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u8x16; 4] = transmute(vld4q_dup_u8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_u16() { + let a: [u16; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [u16x8; 4] = [u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1), u16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [u16x8; 4] = transmute(vld4q_dup_u16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_u32() { + let a: [u32; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [u32x4; 4] = [u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1), u32x4::new(1, 1, 1, 1)]; + let r: [u32x4; 4] = transmute(vld4q_dup_u32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_p8() { + let a: [u8; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x8; 4] = [i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1), i8x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x8; 4] = transmute(vld4_dup_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] 
+ unsafe fn test_vld4_dup_p16() { + let a: [u16; 17] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x4; 4] = [i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1), i16x4::new(1, 1, 1, 1)]; + let r: [i16x4; 4] = transmute(vld4_dup_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_p8() { + let a: [u8; 65] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i8x16; 4] = [i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1), i8x16::new(1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i8x16; 4] = transmute(vld4q_dup_p8(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_p16() { + let a: [u16; 33] = [0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9]; + let e: [i16x8; 4] = [i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1), i16x8::new(1, 1, 1, 1, 1, 1, 1, 1)]; + let r: [i16x8; 4] = transmute(vld4q_dup_p16(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_u64() { + let a: [u64; 5] = [0, 1, 1, 1, 1]; + let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(1), u64x1::new(1), u64x1::new(1)]; + let r: [u64x1; 4] = transmute(vld4_dup_u64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_p64() { + let a: [u64; 5] = [0, 1, 1, 1, 1]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(1), i64x1::new(1), i64x1::new(1)]; + let r: [i64x1; 4] = transmute(vld4_dup_p64(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_dup_f32() { + let a: [f32; 9] = [0., 1., 1., 1., 1., 6., 4., 3., 5.]; + let e: [f32x2; 4] = [f32x2::new(1., 1.), f32x2::new(1., 1.), f32x2::new(1., 1.), f32x2::new(1., 1.)]; + let r: [f32x2; 4] = transmute(vld4_dup_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_dup_f32() { + let a: [f32; 17] = [0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5.]; + let e: [f32x4; 4] = [f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.), f32x4::new(1., 1., 1., 1.)]; + let r: [f32x4; 4] = transmute(vld4q_dup_f32(a[1..].as_ptr())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_s8_x2() { let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 624f4e378a..ed53e037e9 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2378,15 +2378,18 @@ out-dup-nox a = 0, 1, 1, 1, 3, 1, 4, 3, 5, 1, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17, 6, 14, 7, 15, 8, 16, 9, 17 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 load_fn +arm-aarch64-separate aarch64 = ld3r link-aarch64 = ld3r._EXT2_ -//generate *const i64:int64x2x3_t +generate *const i64:int64x2x3_t -arm = vld3dup +arm = vld3 link-arm = vld3dup._EXTpi82_ -//generate *const 
i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t, *const i64:int64x1x3_t
-//generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t
+generate *const i8:int8x8x3_t, *const i16:int16x4x3_t, *const i32:int32x2x3_t
+generate *const i8:int8x16x3_t, *const i16:int16x8x3_t, *const i32:int32x4x3_t
+arm = nop
+generate *const i64:int64x1x3_t
 
 /// Load single 3-element structure and replicate to all lanes of three registers
 name = vld3
@@ -2397,17 +2400,19 @@ validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 load_fn
 
 aarch64 = ld3r
-//generate *const u64:uint64x2x3_t
+generate *const u64:uint64x2x3_t
 target = aes
-//generate *const p64:poly64x2x3_t
+generate *const p64:poly64x2x3_t
 target = default
 
-arm = vld3dup
-//generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t, *const u64:uint64x1x3_t
-//generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t
-//generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t
+arm = vld3
+generate *const u8:uint8x8x3_t, *const u16:uint16x4x3_t, *const u32:uint32x2x3_t
+generate *const u8:uint8x16x3_t, *const u16:uint16x8x3_t, *const u32:uint32x4x3_t
+generate *const p8:poly8x8x3_t, *const p16:poly16x4x3_t, *const p8:poly8x16x3_t, *const p16:poly16x8x3_t
+arm = nop
+generate *const u64:uint64x1x3_t
 target = aes
-//generate *const p64:poly64x1x3_t
+generate *const p64:poly64x1x3_t
 
 /// Load single 3-element structure and replicate to all lanes of three registers
 name = vld3
 out-dup-nox
 a = 0., 1., 1., 1., 3., 1., 4., 3., 5., 1., 4., 3., 5.
 validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.
 load_fn
+arm-aarch64-separate
 
 aarch64 = ld3r
 link-aarch64 = ld3r._EXT2_
-//generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
+generate *const f64:float64x1x3_t, *const f64:float64x2x3_t
 
-arm = vld3dup
+arm = vld3
 link-arm = vld3dup._EXTpi82_
-//generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
+generate *const f32:float32x2x3_t, *const f32:float32x4x3_t
 
 /// Load multiple 3-element structures to three registers
 name = vld3
@@ -2563,15 +2569,18 @@ out-dup-nox
 a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1
 load_fn
+arm-aarch64-separate
 
 aarch64 = ld4r
 link-aarch64 = ld4r._EXT2_
-//generate *const i64:int64x2x4_t
+generate *const i64:int64x2x4_t
 
 arm = vld4dup
 link-arm = vld4dup._EXTpi82_
-//generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t, *const i64:int64x1x4_t
-//generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t
+generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t
+generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t
+arm = nop
+generate *const i64:int64x1x4_t
 
 /// Load single 4-element structure and replicate to all lanes of four registers
 name = vld4
 out-dup-nox
 a = 0, 1, 1, 1, 1, 2, 4, 3, 5, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9, 8, 6, 3, 7, 4, 8, 5, 9
 validate 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 load_fn
 
 aarch64 = ld4r
-//generate *const u64:uint64x2x4_t
+generate *const u64:uint64x2x4_t
 target = aes
-//generate *const p64:poly64x2x4_t
+generate *const 
p64:poly64x2x4_t target = default arm = vld4dup -//generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t, *const u64:uint64x1x4_t -//generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t -//generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t +generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t +generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t +generate *const p8:poly8x8x4_t, *const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t +arm = nop +generate *const u64:uint64x1x4_t target = aes -//generate *const p64:poly64x1x4_t +generate *const p64:poly64x1x4_t /// Load single 4-element structure and replicate to all lanes of four registers name = vld4 @@ -2600,14 +2611,15 @@ out-dup-nox a = 0., 1., 1., 1., 1., 6., 4., 3., 5., 7., 4., 3., 5., 8., 4., 3., 5., 9., 4., 3., 5. validate 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1. load_fn +arm-aarch64-separate aarch64 = ld4r link-aarch64 = ld4r._EXT2_ -//generate *const f64:float64x1x4_t, *const f64:float64x2x4_t +generate *const f64:float64x1x4_t, *const f64:float64x2x4_t arm = vld4dup link-arm = vld4dup._EXTpi82_ -//generate *const f32:float32x2x4_t, *const f32:float32x4x4_t +generate *const f32:float32x2x4_t, *const f32:float32x4x4_t /// Load multiple 4-element structures to four registers name = vld4 From f5ea0b7072e7111d63cf32daf9cb681e586cd287 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 15:17:52 +0800 Subject: [PATCH 11/28] correct assert_instr --- .../src/arm_shared/neon/generated.rs | 36 +++++++++---------- crates/stdarch-gen/neon.spec | 6 ++-- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index e31d4eed3d..8f87413c5b 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -8622,7 +8622,7 @@ vld4q_f32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4_dup_s8(a: *const i8) -> int8x8x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8650,7 +8650,7 @@ vld4_dup_s8_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4_dup_s16(a: *const i16) -> int16x4x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8678,7 +8678,7 @@ vld4_dup_s16_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4_dup_s32(a: *const i32) -> int32x2x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8706,7 +8706,7 @@ vld4_dup_s32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4q_dup_s8(a: *const i8) -> int8x16x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8734,7 +8734,7 @@ vld4q_dup_s8_(a.cast()) #[inline] 
#[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4q_dup_s16(a: *const i16) -> int16x8x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8762,7 +8762,7 @@ vld4q_dup_s16_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4q_dup_s32(a: *const i32) -> int32x4x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8818,7 +8818,7 @@ vld4_dup_s64_(a.cast()) #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t { transmute(vld4_dup_s8(transmute(a))) @@ -8828,7 +8828,7 @@ pub unsafe fn vld4_dup_u8(a: *const u8) -> uint8x8x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t { transmute(vld4_dup_s16(transmute(a))) @@ -8838,7 +8838,7 @@ pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t { transmute(vld4_dup_s32(transmute(a))) @@ -8848,7 +8848,7 @@ pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t { transmute(vld4q_dup_s8(transmute(a))) @@ -8858,7 +8858,7 @@ pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t { transmute(vld4q_dup_s16(transmute(a))) @@ -8868,7 +8868,7 @@ pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t { 
transmute(vld4q_dup_s32(transmute(a))) @@ -8878,7 +8878,7 @@ pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t { transmute(vld4_dup_s8(transmute(a))) @@ -8888,7 +8888,7 @@ pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t { transmute(vld4_dup_s16(transmute(a))) @@ -8898,7 +8898,7 @@ pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t { transmute(vld4q_dup_s8(transmute(a))) @@ -8908,7 +8908,7 @@ pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t { #[inline] #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t { transmute(vld4q_dup_s16(transmute(a))) @@ -8938,7 +8938,7 @@ pub unsafe fn vld4_dup_p64(a: *const p64) -> poly64x1x4_t { #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { #[allow(improper_ctypes)] extern "unadjusted" { @@ -8966,7 +8966,7 @@ vld4_dup_f32_(a.cast()) #[inline] #[cfg(target_arch = "arm")] #[target_feature(enable = "neon,v7")] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4dup))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { #[allow(improper_ctypes)] extern "unadjusted" { diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index ed53e037e9..a1058e6952 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2575,7 +2575,7 @@ aarch64 = ld4r link-aarch64 = ld4r._EXT2_ generate *const i64:int64x2x4_t -arm = vld4dup +arm = vld4 link-arm = vld4dup._EXTpi82_ generate *const i8:int8x8x4_t, *const i16:int16x4x4_t, *const i32:int32x2x4_t generate *const i8:int8x16x4_t, *const i16:int16x8x4_t, *const i32:int32x4x4_t @@ -2596,7 +2596,7 @@ target = aes generate *const p64:poly64x2x4_t target = default -arm = vld4dup +arm = vld4 generate *const u8:uint8x8x4_t, *const u16:uint16x4x4_t, *const u32:uint32x2x4_t generate *const u8:uint8x16x4_t, *const u16:uint16x8x4_t, *const u32:uint32x4x4_t generate *const p8:poly8x8x4_t, 
*const p16:poly16x4x4_t, *const p8:poly8x16x4_t, *const p16:poly16x8x4_t
@@ -2617,7 +2617,7 @@ aarch64 = ld4r
 link-aarch64 = ld4r._EXT2_
 generate *const f64:float64x1x4_t, *const f64:float64x2x4_t
 
-arm = vld4dup
+arm = vld4
 link-arm = vld4dup._EXTpi82_
 generate *const f32:float32x2x4_t, *const f32:float32x4x4_t

From 20b98369ff691fa86acce5858a636fc361d3c8af Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 15:49:35 +0800
Subject: [PATCH 12/28] add vld2_lane neon instrs

---
 .../core_arch/src/aarch64/neon/generated.rs | 234 +++++++++
 .../src/arm_shared/neon/generated.rs        | 455 ++++++++++++++++++
 crates/stdarch-gen/neon.spec                |  24 +-
 3 files changed, 701 insertions(+), 12 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 77ecc3ac55..e85917de60 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -4702,6 +4702,141 @@ pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t {
     vld2q_dup_f64_(a.cast())
 }
 
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x2_t) -> int8x16x2_t {
+    static_assert_imm4!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v16i8.p0i8")]
+        fn vld2q_lane_s8_(a: int8x16_t, b: int8x16_t, n: i64, ptr: *const i8) -> int8x16x2_t;
+    }
+    vld2q_lane_s8_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x2_t) -> int64x1x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v1i64.p0i8")]
+        fn vld2_lane_s64_(a: int64x1_t, b: int64x1_t, n: i64, ptr: *const i8) -> int64x1x2_t;
+    }
+    vld2_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x2_t) -> int64x2x2_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2i64.p0i8")]
+        fn vld2q_lane_s64_(a: int64x2_t, b: int64x2_t, n: i64, ptr: *const i8) -> int64x2x2_t;
+    }
+    vld2q_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_p64<const LANE: i32>(a: *const p64, b: poly64x1x2_t) -> poly64x1x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vld2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x2_t) -> poly64x2x2_t {
+    static_assert_imm1!(LANE);
+    transmute(vld2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_u8<const LANE: i32>(a: *const u8, b: uint8x16x2_t) -> uint8x16x2_t {
+    static_assert_imm4!(LANE);
+    transmute(vld2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_u64<const LANE: i32>(a: *const u64, b: uint64x1x2_t) -> uint64x1x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vld2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_u64<const LANE: i32>(a: *const u64, b: uint64x2x2_t) -> uint64x2x2_t {
+    static_assert_imm1!(LANE);
+    transmute(vld2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_p8<const LANE: i32>(a: *const p8, b: poly8x16x2_t) -> poly8x16x2_t {
+    static_assert_imm4!(LANE);
+    transmute(vld2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_f64<const LANE: i32>(a: *const f64, b: float64x1x2_t) -> float64x1x2_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v1f64.p0i8")]
+        fn vld2_lane_f64_(a: float64x1_t, b: float64x1_t, n: i64, ptr: *const i8) -> float64x1x2_t;
+    }
+    vld2_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x2_t) -> float64x2x2_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2f64.p0i8")]
+        fn vld2q_lane_f64_(a: float64x2_t, b: float64x2_t, n: i64, ptr: *const i8) -> float64x2x2_t;
+    }
+    vld2q_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+}
+
 /// Load multiple 3-element structures to three registers
 #[inline]
 #[target_feature(enable = "neon")]
@@ -13471,6 +13606,105 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vld2q_lane_s8() {
+        let a: [i8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+        let b: [i8x16; 2] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+        let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)];
+        let r: [i8x16; 2] = transmute(vld2q_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
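(Illustrative note, not part of the patch: the lane index is a const generic, so it is checked at compile time by the static_assert macros above, and only the addressed lane of the existing register pair is replaced. A minimal usage sketch in Rust, assuming a toolchain whose core::arch::aarch64 exports the intrinsics added above; the helper name is hypothetical:

    use core::arch::aarch64::{int64x2x2_t, vld2q_lane_s64};

    // Reload lane 0 of a de-interleaved pair from `src`; lane 1 is preserved.
    // LANE = 0 satisfies static_assert_imm1!, which only permits 0 or 1 here.
    unsafe fn reload_lane0(src: *const i64, acc: int64x2x2_t) -> int64x2x2_t {
        vld2q_lane_s64::<0>(src, acc)
    }

Keeping the lane as a const generic matters because LD2 {}[lane] encodes the lane number as an immediate in the instruction.)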
#[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_s64() { + let a: [i64; 3] = [0, 1, 2]; + let b: [i64x1; 2] = [i64x1::new(0), i64x1::new(2)]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; + let r: [i64x1; 2] = transmute(vld2_lane_s64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_s64() { + let a: [i64; 5] = [0, 1, 2, 3, 4]; + let b: [i64x2; 2] = [i64x2::new(0, 2), i64x2::new(2, 14)]; + let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 14)]; + let r: [i64x2; 2] = transmute(vld2q_lane_s64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_p64() { + let a: [u64; 3] = [0, 1, 2]; + let b: [i64x1; 2] = [i64x1::new(0), i64x1::new(2)]; + let e: [i64x1; 2] = [i64x1::new(1), i64x1::new(2)]; + let r: [i64x1; 2] = transmute(vld2_lane_p64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_p64() { + let a: [u64; 5] = [0, 1, 2, 3, 4]; + let b: [i64x2; 2] = [i64x2::new(0, 2), i64x2::new(2, 14)]; + let e: [i64x2; 2] = [i64x2::new(1, 2), i64x2::new(2, 14)]; + let r: [i64x2; 2] = transmute(vld2q_lane_p64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_u8() { + let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u8x16; 2] = [u8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)]; + let e: [u8x16; 2] = [u8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)]; + let r: [u8x16; 2] = transmute(vld2q_lane_u8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_u64() { + let a: [u64; 3] = [0, 1, 2]; + let b: [u64x1; 2] = [u64x1::new(0), u64x1::new(2)]; + let e: [u64x1; 2] = [u64x1::new(1), u64x1::new(2)]; + let r: [u64x1; 2] = transmute(vld2_lane_u64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_u64() { + let a: [u64; 5] = [0, 1, 2, 3, 4]; + let b: [u64x2; 2] = [u64x2::new(0, 2), u64x2::new(2, 14)]; + let e: [u64x2; 2] = [u64x2::new(1, 2), u64x2::new(2, 14)]; + let r: [u64x2; 2] = transmute(vld2q_lane_u64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_p8() { + let a: [u8; 33] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x16; 2] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i8x16; 2] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i8x16; 2] = transmute(vld2q_lane_p8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_f64() { + let a: [f64; 3] = [0., 1., 2.]; + let b: [f64; 2] = [0., 2.]; + let e: [f64; 2] = [1., 2.]; + let r: [f64; 2] = transmute(vld2_lane_f64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn 
test_vld2q_lane_f64() {
+        let a: [f64; 5] = [0., 1., 2., 3., 4.];
+        let b: [f64x2; 2] = [f64x2::new(0., 2.), f64x2::new(2., 14.)];
+        let e: [f64x2; 2] = [f64x2::new(1., 2.), f64x2::new(2., 14.)];
+        let r: [f64x2; 2] = transmute(vld2q_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vld3q_s64() {
         let a: [i64; 7] = [0, 1, 2, 2, 2, 4, 4];
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 8f87413c5b..2bdab49296 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -7502,6 +7502,326 @@ pub unsafe fn vld2q_dup_f32(a: *const f32) -> float32x4x2_t {
     vld2q_dup_f32_(a.cast())
 }
 
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8x8x2_t {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i8.p0i8")]
+        fn vld2_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32);
+    }
+vld2_lane_s8_(a.cast(), b.0, b.1, LANE, 1)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8x8x2_t {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v8i8.p0i8")]
+        fn vld2_lane_s8_(a: int8x8_t, b: int8x8_t, n: i64, ptr: *const i8) -> int8x8x2_t;
+    }
+vld2_lane_s8_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> int16x4x2_t {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i16.p0i8")]
+        fn vld2_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32);
+    }
+vld2_lane_s16_(a.cast(), b.0, b.1, LANE, 2)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> int16x4x2_t {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v4i16.p0i8")]
+        fn vld2_lane_s16_(a: int16x4_t, b: int16x4_t, n: i64, ptr: *const i8) -> int16x4x2_t;
+    }
+vld2_lane_s16_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> int32x2x2_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2i32.p0i8")]
+        fn vld2_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32);
+    }
+vld2_lane_s32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> int32x2x2_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2i32.p0i8")]
+        fn vld2_lane_s32_(a: int32x2_t, b: int32x2_t, n: i64, ptr: *const i8) -> int32x2x2_t;
+    }
+vld2_lane_s32_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> int16x8x2_t {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i16.p0i8")]
+        fn vld2q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32);
+    }
+vld2q_lane_s16_(a.cast(), b.0, b.1, LANE, 2)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> int16x8x2_t {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v8i16.p0i8")]
+        fn vld2q_lane_s16_(a: int16x8_t, b: int16x8_t, n: i64, ptr: *const i8) -> int16x8x2_t;
+    }
+vld2q_lane_s16_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> int32x4x2_t {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i32.p0i8")]
+        fn vld2q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32);
+    }
+vld2q_lane_s32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> int32x4x2_t {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v4i32.p0i8")]
+        fn vld2q_lane_s32_(a: int32x4_t, b: int32x4_t, n: i64, ptr: *const i8) -> int32x4x2_t;
+    }
+vld2q_lane_s32_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x2_t) -> uint8x8x2_t {
+    static_assert_imm3!(LANE);
+    transmute(vld2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x2_t) -> uint16x4x2_t {
+    static_assert_imm2!(LANE);
+    transmute(vld2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x2_t) -> uint32x2x2_t {
+    static_assert_imm1!(LANE);
+    transmute(vld2_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x2_t) -> uint16x8x2_t {
+    static_assert_imm3!(LANE);
+    transmute(vld2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x2_t) -> uint32x4x2_t {
+    static_assert_imm2!(LANE);
+    transmute(vld2q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x2_t) -> poly8x8x2_t {
+    static_assert_imm3!(LANE);
+    transmute(vld2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x2_t) -> poly16x4x2_t {
+    static_assert_imm2!(LANE);
+    transmute(vld2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x2_t) -> poly16x8x2_t {
+    static_assert_imm3!(LANE);
+    transmute(vld2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x2_t) -> float32x2x2_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2f32.p0i8")]
+        fn vld2_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32);
+    }
+vld2_lane_f32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x2_t) -> float32x2x2_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v2f32.p0i8")]
+        fn vld2_lane_f32_(a: float32x2_t, b: float32x2_t, n: i64, ptr: *const i8) -> float32x2x2_t;
+    }
+vld2_lane_f32_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x2_t) -> float32x4x2_t {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4f32.p0i8")]
+        fn vld2q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32);
+    }
+vld2q_lane_f32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Load multiple 2-element structures to two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld2q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x2_t) -> float32x4x2_t {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld2lane.v4f32.p0i8")]
+        fn vld2q_lane_f32_(a: float32x4_t, b: float32x4_t, n: i64, ptr: *const i8) -> float32x4x2_t;
+    }
+vld2q_lane_f32_(b.0, b.1, LANE as i64, a.cast())
+}
+
 /// Load multiple 3-element structures to three registers
 #[inline]
 #[cfg(target_arch = "arm")]
@@ -24265,6 +24585,141 @@ mod test {
         assert_eq!(r, e);
     }
 
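(Illustrative note, not part of the patch: both definitions of each intrinsic above expose one public signature, so callers are portable across targets; only the LLVM bindings differ, the ARM one taking the pointer first plus an explicit element size and the AArch64 one taking the vectors first, the lane as an i64, and the pointer last. A hypothetical caller, assuming the matching core::arch module for the target:

    #[cfg(target_arch = "aarch64")]
    use core::arch::aarch64::{uint16x4x2_t, vld2_lane_u16};
    #[cfg(target_arch = "arm")]
    use core::arch::arm::{uint16x4x2_t, vld2_lane_u16};

    // Refresh lane 1 of a de-interleaved u16 pair from `src`; the same call
    // lowers to the vld2 lane form on arm and to ld2lane on aarch64.
    unsafe fn reload_lane1(src: *const u16, acc: uint16x4x2_t) -> uint16x4x2_t {
        vld2_lane_u16::<1>(src, acc) // static_assert_imm2! restricts LANE to 0..=3
    }
)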
+ #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_s8() { + let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x8; 2] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i8x8; 2] = transmute(vld2_lane_s8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_s16() { + let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x4; 2] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18)]; + let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18)]; + let r: [i16x4; 2] = transmute(vld2_lane_s16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_s32() { + let a: [i32; 5] = [0, 1, 2, 3, 4]; + let b: [i32x2; 2] = [i32x2::new(0, 2), i32x2::new(2, 14)]; + let e: [i32x2; 2] = [i32x2::new(1, 2), i32x2::new(2, 14)]; + let r: [i32x2; 2] = transmute(vld2_lane_s32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_s16() { + let a: [i16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x8; 2] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i16x8; 2] = transmute(vld2q_lane_s16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_s32() { + let a: [i32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i32x4; 2] = [i32x4::new(0, 2, 2, 14), i32x4::new(2, 16, 17, 18)]; + let e: [i32x4; 2] = [i32x4::new(1, 2, 2, 14), i32x4::new(2, 16, 17, 18)]; + let r: [i32x4; 2] = transmute(vld2q_lane_s32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_u8() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u8x8; 2] = [u8x8::new(0, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [u8x8; 2] = [u8x8::new(1, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [u8x8; 2] = transmute(vld2_lane_u8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_u16() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u16x4; 2] = [u16x4::new(0, 2, 2, 14), u16x4::new(2, 16, 17, 18)]; + let e: [u16x4; 2] = [u16x4::new(1, 2, 2, 14), u16x4::new(2, 16, 17, 18)]; + let r: [u16x4; 2] = transmute(vld2_lane_u16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_u32() { + let a: [u32; 5] = [0, 1, 2, 3, 4]; + let b: [u32x2; 2] = [u32x2::new(0, 2), u32x2::new(2, 14)]; + let e: [u32x2; 2] = [u32x2::new(1, 2), u32x2::new(2, 14)]; + let r: [u32x2; 2] = transmute(vld2_lane_u32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_u16() { + let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u16x8; 2] = [u16x8::new(0, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [u16x8; 2] = [u16x8::new(1, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: 
[u16x8; 2] = transmute(vld2q_lane_u16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_u32() { + let a: [u32; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u32x4; 2] = [u32x4::new(0, 2, 2, 14), u32x4::new(2, 16, 17, 18)]; + let e: [u32x4; 2] = [u32x4::new(1, 2, 2, 14), u32x4::new(2, 16, 17, 18)]; + let r: [u32x4; 2] = transmute(vld2q_lane_u32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_p8() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x8; 2] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i8x8; 2] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i8x8; 2] = transmute(vld2_lane_p8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_p16() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x4; 2] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18)]; + let e: [i16x4; 2] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18)]; + let r: [i16x4; 2] = transmute(vld2_lane_p16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_p16() { + let a: [u16; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x8; 2] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i16x8; 2] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i16x8; 2] = transmute(vld2q_lane_p16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2_lane_f32() { + let a: [f32; 5] = [0., 1., 2., 3., 4.]; + let b: [f32x2; 2] = [f32x2::new(0., 2.), f32x2::new(2., 14.)]; + let e: [f32x2; 2] = [f32x2::new(1., 2.), f32x2::new(2., 14.)]; + let r: [f32x2; 2] = transmute(vld2_lane_f32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld2q_lane_f32() { + let a: [f32; 9] = [0., 1., 2., 3., 4., 5., 6., 7., 8.]; + let b: [f32x4; 2] = [f32x4::new(0., 2., 2., 14.), f32x4::new(2., 16., 17., 18.)]; + let e: [f32x4; 2] = [f32x4::new(1., 2., 2., 14.), f32x4::new(2., 16., 17., 18.)]; + let r: [f32x4; 2] = transmute(vld2q_lane_f32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld3_s8() { let a: [i8; 25] = [0, 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index a1058e6952..8deecc564d 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2251,13 +2251,13 @@ arm-aarch64-separate aarch64 = ld2lane const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ -//generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t +generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t arm = vld2lane const-arm = LANE link-arm = vld2lane._EXTpi82_ -//generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t -//generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t +generate *const i8:int8x8x2_t:int8x8x2_t, *const 
i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t
+generate *const i16:int16x8x2_t:int16x8x2_t, *const i32:int32x4x2_t:int32x4x2_t
 
 /// Load multiple 2-element structures to two registers
 name = vld2
@@ -2275,18 +2275,18 @@ aarch64 = ld2lane
 const-aarch64 = LANE
 
 target = aes
-//generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t
+generate *const p64:poly64x1x2_t:poly64x1x2_t, *const p64:poly64x2x2_t:poly64x2x2_t
 target = default
 
-//generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t
-//generate *const p8:poly8x16x2_t
+generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t
+generate *const p8:poly8x16x2_t
 
 arm = vld2lane
 const-arm = LANE
-//generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t
-//generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t
-//generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t
-//generate *const p16:poly16x8x2_t
+generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t
+generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t
+generate *const p8:poly8x8x2_t:poly8x8x2_t, *const p16:poly16x4x2_t:poly16x4x2_t
+generate *const p16:poly16x8x2_t
 
 /// Load multiple 2-element structures to two registers
 name = vld2
@@ -2303,12 +2303,12 @@ arm-aarch64-separate
 
 aarch64 = ld2lane
 const-aarch64 = LANE
 link-aarch64 = ld2lane._EXTpi82_
-//generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t
+generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t
 
 arm = vld2lane
 const-arm = LANE
 link-arm = vld2lane._EXTpi82_
-//generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t
+generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t
 
 /// Load multiple 3-element structures to three registers
 name = vld3

From 3a2917d5782881216bc017f1041c6fd931e5b553 Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 15:55:20 +0800
Subject: [PATCH 13/28] correct extern return type

---
 crates/core_arch/src/arm_shared/neon/generated.rs | 14 +++++++-------
 crates/stdarch-gen/src/main.rs                    |  7 ++++++-
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 2bdab49296..2434291b6e 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -7513,7 +7513,7 @@ pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i8.p0i8")]
-        fn vld2_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32);
+        fn vld2_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32) -> int8x8x2_t;
     }
 vld2_lane_s8_(a.cast(), b.0, b.1, LANE, 1)
 }
@@ -7545,7 +7545,7 @@ pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> i
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i16.p0i8")]
-        fn vld2_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32);
+        fn vld2_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32) -> int16x4x2_t;
     }
 vld2_lane_s16_(a.cast(), b.0, b.1, LANE, 2)
 }
@@ -7577,7 +7577,7 @@ pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> i
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2i32.p0i8")]
-        fn vld2_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32);
+        fn vld2_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32) -> int32x2x2_t;
     }
 vld2_lane_s32_(a.cast(), b.0, b.1, LANE, 4)
 }
@@ -7609,7 +7609,7 @@ pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> 
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v8i16.p0i8")]
-        fn vld2q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32);
+        fn vld2q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32) -> int16x8x2_t;
     }
 vld2q_lane_s16_(a.cast(), b.0, b.1, LANE, 2)
 }
@@ -7641,7 +7641,7 @@ pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> 
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4i32.p0i8")]
-        fn vld2q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32);
+        fn vld2q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32) -> int32x4x2_t;
     }
 vld2q_lane_s32_(a.cast(), b.0, b.1, LANE, 4)
 }
@@ -7769,7 +7769,7 @@ pub unsafe fn vld2_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x2_t) ->
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v2f32.p0i8")]
-        fn vld2_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32);
+        fn vld2_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32) -> float32x2x2_t;
     }
 vld2_lane_f32_(a.cast(), b.0, b.1, LANE, 4)
 }
@@ -7801,7 +7801,7 @@ pub unsafe fn vld2q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x2_t) -
     #[allow(improper_ctypes)]
     extern "unadjusted" {
         #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld2lane.v4f32.p0i8")]
-        fn vld2q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32);
+        fn vld2q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32) -> float32x4x2_t;
     }
 vld2q_lane_f32_(a.cast(), b.0, b.1, LANE, 4)
 }
diff --git a/crates/stdarch-gen/src/main.rs b/crates/stdarch-gen/src/main.rs
index 1c518d5b82..ec76d5639b 100644
--- a/crates/stdarch-gen/src/main.rs
+++ b/crates/stdarch-gen/src/main.rs
@@ -1832,9 +1832,14 @@ fn gen_arm(
             ),
             _ => panic!("unknown type: {}", in_t[1]),
         };
+        let out = if out_t == "void" {
+            String::new()
+        } else {
+            format!(" -> {}", out_t)
+        };
         (
             format!("ptr: {}, {}, n: i32, size: i32", ptr_type, inputs),
-            String::new(),
+            out,
         )
     } else {
         let (_, const_type) = if const_arm.contains(":") {

From e5647d7c6877d83d3b1d6eaab873a5c2c62e7e1b Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 16:00:46 +0800
Subject: [PATCH 14/28] correct assert_instr

---
 .../src/arm_shared/neon/generated.rs | 30 +++++++++----------
 crates/stdarch-gen/neon.spec         |  6 ++--
 2 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 2434291b6e..543e4d44e3 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -7506,7 +7506,7 @@ vld2q_dup_f32_(a.cast())
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x2_t) -> int8x8x2_t {
     static_assert_imm3!(LANE);
@@ -7538,7 +7538,7 @@ vld2_lane_s8_(b.0, b.1, LANE as i64, a.cast())
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_s16<const LANE: i32>(a: *const i16, b: int16x4x2_t) -> int16x4x2_t {
     static_assert_imm2!(LANE);
@@ -7570,7 +7570,7 @@ vld2_lane_s16_(b.0, b.1, LANE as i64, a.cast())
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_s32<const LANE: i32>(a: *const i32, b: int32x2x2_t) -> int32x2x2_t {
     static_assert_imm1!(LANE);
@@ -7602,7 +7602,7 @@ vld2_lane_s32_(b.0, b.1, LANE as i64, a.cast())
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2q_lane_s16<const LANE: i32>(a: *const i16, b: int16x8x2_t) -> int16x8x2_t {
     static_assert_imm3!(LANE);
@@ -7634,7 +7634,7 @@ vld2q_lane_s16_(b.0, b.1, LANE as i64, a.cast())
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x2_t) -> int32x4x2_t {
     static_assert_imm2!(LANE);
@@ -7666,7 +7666,7 @@ vld2q_lane_s32_(b.0, b.1, LANE as i64, a.cast())
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x2_t) -> uint8x8x2_t {
@@ -7678,7 +7678,7 @@ pub unsafe fn vld2_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x2_t) -> uin
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x2_t) -> uint16x4x2_t {
@@ -7690,7 +7690,7 @@ pub unsafe fn vld2_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x2_t) -> 
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x2_t) -> uint32x2x2_t {
@@ -7702,7 +7702,7 @@ pub unsafe fn vld2_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x2_t) -> 
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x2_t) -> uint16x8x2_t {
@@ -7714,7 +7714,7 @@ pub unsafe fn vld2q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x2_t) ->
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x2_t) -> uint32x4x2_t {
@@ -7726,7 +7726,7 @@ pub unsafe fn vld2q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x2_t) ->
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x2_t) -> poly8x8x2_t {
@@ -7738,7 +7738,7 @@ pub unsafe fn vld2_lane_p8<const LANE: i32>(a: *const p8, b: poly8x8x2_t) -> pol
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x2_t) -> poly16x4x2_t {
@@ -7750,7 +7750,7 @@ pub unsafe fn vld2_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x2_t) -> 
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x2_t) -> poly16x8x2_t {
@@ -7762,7 +7762,7 @@ pub unsafe fn vld2q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x2_t) ->
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))]
 #[rustc_legacy_const_generics(2)]
 pub unsafe fn vld2_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x2_t) -> float32x2x2_t {
     static_assert_imm1!(LANE);
@@ -7794,7 +7794,7 @@ vld2_lane_f32_(b.0, b.1, LANE as i64, a.cast())
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2lane, LANE 
= 0))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_f32(a: *const f32, b: float32x4x2_t) -> float32x4x2_t { static_assert_imm2!(LANE); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 8deecc564d..7e02763eac 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2253,7 +2253,7 @@ const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t -arm = vld2lane +arm = vld2 const-arm = LANE link-arm = vld2lane._EXTpi82_ generate *const i8:int8x8x2_t:int8x8x2_t, *const i16:int16x4x2_t:int16x4x2_t, *const i32:int32x2x2_t:int32x2x2_t @@ -2281,7 +2281,7 @@ target = default generate *const u8:uint8x16x2_t:uint8x16x2_t, *const u64:uint64x1x2_t:uint64x1x2_t, *const u64:uint64x2x2_t:uint64x2x2_t generate *const p8:poly8x16x2_t:poly8x16x2_t -arm = vld2lane +arm = vld2 const-arm = LANE generate *const u8:uint8x8x2_t:uint8x8x2_t, *const u16:uint16x4x2_t:uint16x4x2_t, *const u32:uint32x2x2_t:uint32x2x2_t generate *const u16:uint16x8x2_t:uint16x8x2_t, *const u32:uint32x4x2_t:uint32x4x2_t @@ -2305,7 +2305,7 @@ const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t -arm = vld2lane +arm = vld2 const-arm = LANE link-arm = vld2lane._EXTpi82_ generate *const f32:float32x2x2_t:float32x2x2_t, *const f32:float32x4x2_t:float32x4x2_t From d16fcbce9d7f62ac8c2a23246fe548af529fc999 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 16:07:00 +0800 Subject: [PATCH 15/28] correct assert_instr --- .../core_arch/src/aarch64/neon/generated.rs | 18 ++++++------- .../src/arm_shared/neon/generated.rs | 26 +++++++++---------- crates/stdarch-gen/neon.spec | 4 +-- 3 files changed, 24 insertions(+), 24 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index e85917de60..6e1ffb6e46 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4705,7 +4705,7 @@ pub unsafe fn vld2q_dup_f64(a: *const f64) -> float64x2x2_t { /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_s8(a: *const i8, b: int8x16x2_t) -> int8x16x2_t { static_assert_imm4!(LANE); @@ -4720,7 +4720,7 @@ pub unsafe fn vld2q_lane_s8(a: *const i8, b: int8x16x2_t) -> in /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_s64(a: *const i64, b: int64x1x2_t) -> int64x1x2_t { static_assert!(LANE : i32 where LANE == 0); @@ -4735,7 +4735,7 @@ pub unsafe fn vld2_lane_s64(a: *const i64, b: int64x1x2_t) -> i /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_s64(a: *const i64, b: int64x2x2_t) -> int64x2x2_t { static_assert_imm1!(LANE); @@ -4750,7 +4750,7 @@ pub unsafe fn vld2q_lane_s64(a: *const i64, 
b: int64x2x2_t) -> /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon,aes")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_p64(a: *const p64, b: poly64x1x2_t) -> poly64x1x2_t { static_assert!(LANE : i32 where LANE == 0); @@ -4760,7 +4760,7 @@ pub unsafe fn vld2_lane_p64(a: *const p64, b: poly64x1x2_t) -> /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon,aes")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_p64(a: *const p64, b: poly64x2x2_t) -> poly64x2x2_t { static_assert_imm1!(LANE); @@ -4770,7 +4770,7 @@ pub unsafe fn vld2q_lane_p64(a: *const p64, b: poly64x2x2_t) -> /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_u8(a: *const u8, b: uint8x16x2_t) -> uint8x16x2_t { static_assert_imm4!(LANE); @@ -4780,7 +4780,7 @@ pub unsafe fn vld2q_lane_u8(a: *const u8, b: uint8x16x2_t) -> u /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_u64(a: *const u64, b: uint64x1x2_t) -> uint64x1x2_t { static_assert!(LANE : i32 where LANE == 0); @@ -4790,7 +4790,7 @@ pub unsafe fn vld2_lane_u64(a: *const u64, b: uint64x1x2_t) -> /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_u64(a: *const u64, b: uint64x2x2_t) -> uint64x2x2_t { static_assert_imm1!(LANE); @@ -4800,7 +4800,7 @@ pub unsafe fn vld2q_lane_u64(a: *const u64, b: uint64x2x2_t) -> /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_p8(a: *const p8, b: poly8x16x2_t) -> poly8x16x2_t { static_assert_imm4!(LANE); diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 543e4d44e3..c3e645d7b7 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7522,7 +7522,7 @@ vld2_lane_s8_(a.cast(), b.0, b.1, LANE, 1) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_s8(a: *const i8, b: int8x8x2_t) -> int8x8x2_t { static_assert_imm3!(LANE); @@ -7554,7 +7554,7 @@ vld2_lane_s16_(a.cast(), b.0, b.1, LANE, 2) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] 
#[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_s16(a: *const i16, b: int16x4x2_t) -> int16x4x2_t { static_assert_imm2!(LANE); @@ -7586,7 +7586,7 @@ vld2_lane_s32_(a.cast(), b.0, b.1, LANE, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_s32(a: *const i32, b: int32x2x2_t) -> int32x2x2_t { static_assert_imm1!(LANE); @@ -7618,7 +7618,7 @@ vld2q_lane_s16_(a.cast(), b.0, b.1, LANE, 2) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_s16(a: *const i16, b: int16x8x2_t) -> int16x8x2_t { static_assert_imm3!(LANE); @@ -7650,7 +7650,7 @@ vld2q_lane_s32_(a.cast(), b.0, b.1, LANE, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_s32(a: *const i32, b: int32x4x2_t) -> int32x4x2_t { static_assert_imm2!(LANE); @@ -7667,7 +7667,7 @@ vld2q_lane_s32_(b.0, b.1, LANE as i64, a.cast()) #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_u8(a: *const u8, b: uint8x8x2_t) -> uint8x8x2_t { static_assert_imm3!(LANE); @@ -7679,7 +7679,7 @@ pub unsafe fn vld2_lane_u8(a: *const u8, b: uint8x8x2_t) -> uin #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_u16(a: *const u16, b: uint16x4x2_t) -> uint16x4x2_t { static_assert_imm2!(LANE); @@ -7691,7 +7691,7 @@ pub unsafe fn vld2_lane_u16(a: *const u16, b: uint16x4x2_t) -> #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_u32(a: *const u32, b: uint32x2x2_t) -> uint32x2x2_t { static_assert_imm1!(LANE); @@ -7703,7 +7703,7 @@ pub unsafe fn vld2_lane_u32(a: *const u32, b: uint32x2x2_t) -> #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] 
#[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_u16(a: *const u16, b: uint16x8x2_t) -> uint16x8x2_t { static_assert_imm3!(LANE); @@ -7715,7 +7715,7 @@ pub unsafe fn vld2q_lane_u16(a: *const u16, b: uint16x8x2_t) -> #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_u32(a: *const u32, b: uint32x4x2_t) -> uint32x4x2_t { static_assert_imm2!(LANE); @@ -7727,7 +7727,7 @@ pub unsafe fn vld2q_lane_u32(a: *const u32, b: uint32x4x2_t) -> #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_p8(a: *const p8, b: poly8x8x2_t) -> poly8x8x2_t { static_assert_imm3!(LANE); @@ -7739,7 +7739,7 @@ pub unsafe fn vld2_lane_p8(a: *const p8, b: poly8x8x2_t) -> pol #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_p16(a: *const p16, b: poly16x4x2_t) -> poly16x4x2_t { static_assert_imm2!(LANE); @@ -7751,7 +7751,7 @@ pub unsafe fn vld2_lane_p16(a: *const p16, b: poly16x4x2_t) -> #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld2, LANE = 0))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_p16(a: *const p16, b: poly16x8x2_t) -> poly16x8x2_t { static_assert_imm3!(LANE); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 7e02763eac..26a7f57023 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2248,7 +2248,7 @@ validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 1 load_fn arm-aarch64-separate -aarch64 = ld2lane +aarch64 = ld2 const-aarch64 = LANE link-aarch64 = ld2lane._EXTpi82_ generate *const i8:int8x16x2_t:int8x16x2_t, *const i64:int64x1x2_t:int64x1x2_t, *const i64:int64x2x2_t:int64x2x2_t @@ -2271,7 +2271,7 @@ n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26 load_fn -aarch64 = ld2lane +aarch64 = ld2 const-aarch64 = LANE target = aes From f178022bc3a36c929d64363ba4e0600e013f41ed Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 16:13:52 +0800 Subject: [PATCH 16/28] correct assert_instr --- crates/core_arch/src/aarch64/neon/generated.rs | 4 ++-- crates/core_arch/src/arm_shared/neon/generated.rs | 4 ++-- crates/stdarch-gen/neon.spec | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs 
b/crates/core_arch/src/aarch64/neon/generated.rs index 6e1ffb6e46..369dce1cc8 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -4810,7 +4810,7 @@ pub unsafe fn vld2q_lane_p8(a: *const p8, b: poly8x16x2_t) -> p /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_f64(a: *const f64, b: float64x1x2_t) -> float64x1x2_t { static_assert!(LANE : i32 where LANE == 0); @@ -4825,7 +4825,7 @@ pub unsafe fn vld2_lane_f64(a: *const f64, b: float64x1x2_t) -> /// Load multiple 2-element structures to two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(test, assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_f64(a: *const f64, b: float64x2x2_t) -> float64x2x2_t { static_assert_imm1!(LANE); diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index c3e645d7b7..fa74451671 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -7778,7 +7778,7 @@ vld2_lane_f32_(a.cast(), b.0, b.1, LANE, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2_lane_f32(a: *const f32, b: float32x2x2_t) -> float32x2x2_t { static_assert_imm1!(LANE); @@ -7810,7 +7810,7 @@ vld2q_lane_f32_(a.cast(), b.0, b.1, LANE, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2lane, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld2, LANE = 0))] #[rustc_legacy_const_generics(2)] pub unsafe fn vld2q_lane_f32(a: *const f32, b: float32x4x2_t) -> float32x4x2_t { static_assert_imm2!(LANE); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 26a7f57023..1b0250876f 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2300,7 +2300,7 @@ validate 1., 2., 2., 14., 2., 16., 17., 18. 
 load_fn
 arm-aarch64-separate
 
-aarch64 = ld2lane
+aarch64 = ld2
 const-aarch64 = LANE
 link-aarch64 = ld2lane._EXTpi82_
 generate *const f64:float64x1x2_t:float64x1x2_t, *const f64:float64x2x2_t:float64x2x2_t

From 3839a744afb3a968d615842477c5146d5c0dc7d6 Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 16:27:04 +0800
Subject: [PATCH 17/28] add vld3_lane and vld4_lane neon instrs

---
 .../core_arch/src/aarch64/neon/generated.rs   |  468 ++++++++
 .../src/arm_shared/neon/generated.rs          | 1068 +++++++++++++++--
 crates/stdarch-gen/neon.spec                  |   68 +-
 3 files changed, 1491 insertions(+), 113 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 369dce1cc8..477d614b30 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -4947,6 +4947,141 @@ pub unsafe fn vld3q_dup_f64(a: *const f64) -> float64x2x3_t {
     vld3q_dup_f64_(a.cast())
 }
 
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3q_lane_s8<const LANE: i32>(a: *const i8, b: int8x16x3_t) -> int8x16x3_t {
+    static_assert_imm4!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v16i8.p0i8")]
+        fn vld3q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *const i8) -> int8x16x3_t;
+    }
+    vld3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_s64<const LANE: i32>(a: *const i64, b: int64x1x3_t) -> int64x1x3_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v1i64.p0i8")]
+        fn vld3_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *const i8) -> int64x1x3_t;
+    }
+    vld3_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3q_lane_s64<const LANE: i32>(a: *const i64, b: int64x2x3_t) -> int64x2x3_t {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2i64.p0i8")]
+        fn vld3q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *const i8) -> int64x2x3_t;
+    }
+    vld3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_p64<const LANE: i32>(a: *const p64, b: poly64x1x3_t) -> poly64x1x3_t {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vld3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(ld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3q_lane_p64<const LANE: i32>(a: *const p64, b: poly64x2x3_t) -> poly64x2x3_t {
+    static_assert_imm1!(LANE);
+    transmute(vld3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Load
multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_p8(a: *const p8, b: poly8x16x3_t) -> poly8x16x3_t { + static_assert_imm4!(LANE); + transmute(vld3q_lane_s8::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_u8(a: *const u8, b: uint8x16x3_t) -> uint8x16x3_t { + static_assert_imm4!(LANE); + transmute(vld3q_lane_s8::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_u64(a: *const u64, b: uint64x1x3_t) -> uint64x1x3_t { + static_assert!(LANE : i32 where LANE == 0); + transmute(vld3_lane_s64::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_u64(a: *const u64, b: uint64x2x3_t) -> uint64x2x3_t { + static_assert_imm1!(LANE); + transmute(vld3q_lane_s64::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_f64(a: *const f64, b: float64x1x3_t) -> float64x1x3_t { + static_assert!(LANE : i32 where LANE == 0); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v1f64.p0i8")] + fn vld3_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *const i8) -> float64x1x3_t; + } + vld3_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_f64(a: *const f64, b: float64x2x3_t) -> float64x2x3_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2f64.p0i8")] + fn vld3q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *const i8) -> float64x2x3_t; + } + vld3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + /// Load multiple 4-element structures to four registers #[inline] #[target_feature(enable = "neon")] @@ -5057,6 +5192,141 @@ pub unsafe fn vld4q_dup_f64(a: *const f64) -> float64x2x4_t { vld4q_dup_f64_(a.cast()) } +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_s8(a: *const i8, b: int8x16x4_t) -> int8x16x4_t { + static_assert_imm4!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v16i8.p0i8")] + fn vld4q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, n: i64, ptr: *const i8) -> int8x16x4_t; + } + vld4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 
4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s64(a: *const i64, b: int64x1x4_t) -> int64x1x4_t { + static_assert!(LANE : i32 where LANE == 0); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v1i64.p0i8")] + fn vld4_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, n: i64, ptr: *const i8) -> int64x1x4_t; + } + vld4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_s64(a: *const i64, b: int64x2x4_t) -> int64x2x4_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2i64.p0i8")] + fn vld4q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, n: i64, ptr: *const i8) -> int64x2x4_t; + } + vld4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_p64(a: *const p64, b: poly64x1x4_t) -> poly64x1x4_t { + static_assert!(LANE : i32 where LANE == 0); + transmute(vld4_lane_s64::(transmute(a), transmute(b))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_p64(a: *const p64, b: poly64x2x4_t) -> poly64x2x4_t { + static_assert_imm1!(LANE); + transmute(vld4q_lane_s64::(transmute(a), transmute(b))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_p8(a: *const p8, b: poly8x16x4_t) -> poly8x16x4_t { + static_assert_imm4!(LANE); + transmute(vld4q_lane_s8::(transmute(a), transmute(b))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_u8(a: *const u8, b: uint8x16x4_t) -> uint8x16x4_t { + static_assert_imm4!(LANE); + transmute(vld4q_lane_s8::(transmute(a), transmute(b))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_u64(a: *const u64, b: uint64x1x4_t) -> uint64x1x4_t { + static_assert!(LANE : i32 where LANE == 0); + transmute(vld4_lane_s64::(transmute(a), transmute(b))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_u64(a: *const u64, b: uint64x2x4_t) -> uint64x2x4_t { + static_assert_imm1!(LANE); + transmute(vld4q_lane_s64::(transmute(a), transmute(b))) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] 
+#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_f64(a: *const f64, b: float64x1x4_t) -> float64x1x4_t { + static_assert!(LANE : i32 where LANE == 0); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v1f64.p0i8")] + fn vld4_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, n: i64, ptr: *const i8) -> float64x1x4_t; + } + vld4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_f64(a: *const f64, b: float64x2x4_t) -> float64x2x4_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2f64.p0i8")] + fn vld4q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, n: i64, ptr: *const i8) -> float64x2x4_t; + } + vld4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + /// Store multiple single-element structures to one, two, three, or four registers #[inline] #[target_feature(enable = "neon")] @@ -13785,6 +14055,105 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_s8() { + let a: [i8; 49] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x16; 3] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)]; + let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)]; + let r: [i8x16; 3] = transmute(vld3q_lane_s8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_s64() { + let a: [i64; 4] = [0, 1, 2, 2]; + let b: [i64x1; 3] = [i64x1::new(0), i64x1::new(2), i64x1::new(2)]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 3] = transmute(vld3_lane_s64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_s64() { + let a: [i64; 7] = [0, 1, 2, 2, 4, 5, 6]; + let b: [i64x2; 3] = [i64x2::new(0, 2), i64x2::new(2, 14), i64x2::new(2, 16)]; + let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 14), i64x2::new(2, 16)]; + let r: [i64x2; 3] = transmute(vld3q_lane_s64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_p64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let b: [i64x1; 3] = [i64x1::new(0), i64x1::new(2), i64x1::new(2)]; + let e: [i64x1; 3] = [i64x1::new(1), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 3] = transmute(vld3_lane_p64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_p64() { + let a: [u64; 7] = [0, 1, 2, 2, 4, 5, 6]; + let b: [i64x2; 3] = [i64x2::new(0, 2), i64x2::new(2, 14), i64x2::new(2, 16)]; + let e: [i64x2; 3] = [i64x2::new(1, 2), i64x2::new(2, 14), i64x2::new(2, 16)]; + let r: [i64x2; 3] = 
transmute(vld3q_lane_p64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_p8() { + let a: [u8; 49] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x16; 3] = [i8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)]; + let e: [i8x16; 3] = [i8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)]; + let r: [i8x16; 3] = transmute(vld3q_lane_p8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_u8() { + let a: [u8; 49] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u8x16; 3] = [u8x16::new(0, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)]; + let e: [u8x16; 3] = [u8x16::new(1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8)]; + let r: [u8x16; 3] = transmute(vld3q_lane_u8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_u64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let b: [u64x1; 3] = [u64x1::new(0), u64x1::new(2), u64x1::new(2)]; + let e: [u64x1; 3] = [u64x1::new(1), u64x1::new(2), u64x1::new(2)]; + let r: [u64x1; 3] = transmute(vld3_lane_u64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_u64() { + let a: [u64; 7] = [0, 1, 2, 2, 4, 5, 6]; + let b: [u64x2; 3] = [u64x2::new(0, 2), u64x2::new(2, 14), u64x2::new(2, 16)]; + let e: [u64x2; 3] = [u64x2::new(1, 2), u64x2::new(2, 14), u64x2::new(2, 16)]; + let r: [u64x2; 3] = transmute(vld3q_lane_u64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_f64() { + let a: [f64; 4] = [0., 1., 2., 2.]; + let b: [f64; 3] = [0., 2., 2.]; + let e: [f64; 3] = [1., 2., 2.]; + let r: [f64; 3] = transmute(vld3_lane_f64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_f64() { + let a: [f64; 7] = [0., 1., 2., 2., 4., 5., 6.]; + let b: [f64x2; 3] = [f64x2::new(0., 2.), f64x2::new(2., 14.), f64x2::new(9., 16.)]; + let e: [f64x2; 3] = [f64x2::new(1., 2.), f64x2::new(2., 14.), f64x2::new(2., 16.)]; + let r: [f64x2; 3] = transmute(vld3q_lane_f64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld4q_s64() { let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; @@ -13865,6 +14234,105 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_s8() { + let a: [i8; 65] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 
14, 15, 16]; + let b: [i8x16; 4] = [i8x16::new(0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)]; + let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [i8x16; 4] = transmute(vld4q_lane_s8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_s64() { + let a: [i64; 5] = [0, 1, 2, 2, 2]; + let b: [i64x1; 4] = [i64x1::new(0), i64x1::new(2), i64x1::new(2), i64x1::new(2)]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 4] = transmute(vld4_lane_s64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_s64() { + let a: [i64; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8]; + let b: [i64x2; 4] = [i64x2::new(0, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)]; + let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)]; + let r: [i64x2; 4] = transmute(vld4q_lane_s64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_p64() { + let a: [u64; 5] = [0, 1, 2, 2, 2]; + let b: [i64x1; 4] = [i64x1::new(0), i64x1::new(2), i64x1::new(2), i64x1::new(2)]; + let e: [i64x1; 4] = [i64x1::new(1), i64x1::new(2), i64x1::new(2), i64x1::new(2)]; + let r: [i64x1; 4] = transmute(vld4_lane_p64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_p64() { + let a: [u64; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8]; + let b: [i64x2; 4] = [i64x2::new(0, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)]; + let e: [i64x2; 4] = [i64x2::new(1, 2), i64x2::new(2, 2), i64x2::new(2, 16), i64x2::new(2, 18)]; + let r: [i64x2; 4] = transmute(vld4q_lane_p64::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_p8() { + let a: [u8; 65] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16]; + let b: [i8x16; 4] = [i8x16::new(0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)]; + let e: [i8x16; 4] = [i8x16::new(1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), i8x16::new(2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), i8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), i8x16::new(2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)]; + let r: [i8x16; 4] = transmute(vld4q_lane_p8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_u8() { + let a: [u8; 65] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 2, 4, 2, 4, 7, 8, 2, 
4, 7, 8, 13, 14, 15, 16];
+        let b: [u8x16; 4] = [u8x16::new(0, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), u8x16::new(11, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), u8x16::new(1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+        let e: [u8x16; 4] = [u8x16::new(1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26), u8x16::new(2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26), u8x16::new(2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8), u8x16::new(2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16)];
+        let r: [u8x16; 4] = transmute(vld4q_lane_u8::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vld4_lane_u64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 2];
+        let b: [u64x1; 4] = [u64x1::new(0), u64x1::new(2), u64x1::new(2), u64x1::new(2)];
+        let e: [u64x1; 4] = [u64x1::new(1), u64x1::new(2), u64x1::new(2), u64x1::new(2)];
+        let r: [u64x1; 4] = transmute(vld4_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vld4q_lane_u64() {
+        let a: [u64; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8];
+        let b: [u64x2; 4] = [u64x2::new(0, 2), u64x2::new(2, 2), u64x2::new(2, 16), u64x2::new(2, 18)];
+        let e: [u64x2; 4] = [u64x2::new(1, 2), u64x2::new(2, 2), u64x2::new(2, 16), u64x2::new(2, 18)];
+        let r: [u64x2; 4] = transmute(vld4q_lane_u64::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vld4_lane_f64() {
+        let a: [f64; 5] = [0., 1., 2., 2., 2.];
+        let b: [f64; 4] = [0., 2., 2., 2.];
+        let e: [f64; 4] = [1., 2., 2., 2.];
+        let r: [f64; 4] = transmute(vld4_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vld4q_lane_f64() {
+        let a: [f64; 9] = [0., 1., 2., 2., 2., 5., 6., 7., 8.];
+        let b: [f64x2; 4] = [f64x2::new(0., 2.), f64x2::new(2., 2.), f64x2::new(2., 16.), f64x2::new(2., 18.)];
+        let e: [f64x2; 4] = [f64x2::new(1., 2.), f64x2::new(2., 2.), f64x2::new(2., 16.), f64x2::new(2., 18.)];
+        let r: [f64x2; 4] = transmute(vld4q_lane_f64::<0>(a[1..].as_ptr(), transmute(b)));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vst1_f64_x2() {
         let a: [f64; 3] = [0., 1., 2.];
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index fa74451671..844e4bec5e 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -8566,6 +8566,326 @@ pub unsafe fn vld3q_dup_f32(a: *const f32) -> float32x4x3_t {
 vld3q_dup_f32_(a.cast())
 }
 
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld3_lane_s8<const LANE: i32>(a: *const i8, b: int8x8x3_t) -> int8x8x3_t {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8i8.p0i8")]
+        fn vld3_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i32, size: i32) -> int8x8x3_t;
+    }
+vld3_lane_s8_(a.cast(), b.0, b.1, b.2, LANE, 1)
+}
+
+/// Load multiple 3-element structures to three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))]
"aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_s8(a: *const i8, b: int8x8x3_t) -> int8x8x3_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v8i8.p0i8")] + fn vld3_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i64, ptr: *const i8) -> int8x8x3_t; + } +vld3_lane_s8_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_s16(a: *const i16, b: int16x4x3_t) -> int16x4x3_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4i16.p0i8")] + fn vld3_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i32, size: i32) -> int16x4x3_t; + } +vld3_lane_s16_(a.cast(), b.0, b.1, b.2, LANE, 2) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_s16(a: *const i16, b: int16x4x3_t) -> int16x4x3_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v4i16.p0i8")] + fn vld3_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i64, ptr: *const i8) -> int16x4x3_t; + } +vld3_lane_s16_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_s32(a: *const i32, b: int32x2x3_t) -> int32x2x3_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v2i32.p0i8")] + fn vld3_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i32, size: i32) -> int32x2x3_t; + } +vld3_lane_s32_(a.cast(), b.0, b.1, b.2, LANE, 4) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_s32(a: *const i32, b: int32x2x3_t) -> int32x2x3_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2i32.p0i8")] + fn vld3_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i64, ptr: *const i8) -> int32x2x3_t; + } +vld3_lane_s32_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_s16(a: *const i16, b: int16x8x3_t) -> int16x8x3_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + 
#[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v8i16.p0i8")] + fn vld3q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i32, size: i32) -> int16x8x3_t; + } +vld3q_lane_s16_(a.cast(), b.0, b.1, b.2, LANE, 2) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_s16(a: *const i16, b: int16x8x3_t) -> int16x8x3_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v8i16.p0i8")] + fn vld3q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i64, ptr: *const i8) -> int16x8x3_t; + } +vld3q_lane_s16_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_s32(a: *const i32, b: int32x4x3_t) -> int32x4x3_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4i32.p0i8")] + fn vld3q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i32, size: i32) -> int32x4x3_t; + } +vld3q_lane_s32_(a.cast(), b.0, b.1, b.2, LANE, 4) +} + +/// Load multiple 3-element structures to two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_s32(a: *const i32, b: int32x4x3_t) -> int32x4x3_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v4i32.p0i8")] + fn vld3q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i64, ptr: *const i8) -> int32x4x3_t; + } +vld3q_lane_s32_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_u8(a: *const u8, b: uint8x8x3_t) -> uint8x8x3_t { + static_assert_imm3!(LANE); + transmute(vld3_lane_s8::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_u16(a: *const u16, b: uint16x4x3_t) -> uint16x4x3_t { + static_assert_imm2!(LANE); + transmute(vld3_lane_s16::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_u32(a: *const u32, b: uint32x2x3_t) -> uint32x2x3_t { + static_assert_imm1!(LANE); + transmute(vld3_lane_s32::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_u16(a: *const u16, b: uint16x8x3_t) -> uint16x8x3_t { + static_assert_imm3!(LANE); + transmute(vld3q_lane_s16::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_u32(a: *const u32, b: uint32x4x3_t) -> uint32x4x3_t { + static_assert_imm2!(LANE); + transmute(vld3q_lane_s32::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_p8(a: *const p8, b: poly8x8x3_t) -> poly8x8x3_t { + static_assert_imm3!(LANE); + transmute(vld3_lane_s8::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_p16(a: *const p16, b: poly16x4x3_t) -> poly16x4x3_t { + static_assert_imm2!(LANE); + transmute(vld3_lane_s16::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_p16(a: *const p16, b: poly16x8x3_t) -> poly16x8x3_t { + static_assert_imm3!(LANE); + transmute(vld3q_lane_s16::(transmute(a), transmute(b))) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_f32(a: *const f32, b: float32x2x3_t) -> float32x2x3_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v2f32.p0i8")] + fn vld3_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i32, size: i32) -> float32x2x3_t; + } 
+vld3_lane_f32_(a.cast(), b.0, b.1, b.2, LANE, 4) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3_lane_f32(a: *const f32, b: float32x2x3_t) -> float32x2x3_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v2f32.p0i8")] + fn vld3_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i64, ptr: *const i8) -> float32x2x3_t; + } +vld3_lane_f32_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_f32(a: *const f32, b: float32x4x3_t) -> float32x4x3_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld3lane.v4f32.p0i8")] + fn vld3q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i32, size: i32) -> float32x4x3_t; + } +vld3q_lane_f32_(a.cast(), b.0, b.1, b.2, LANE, 4) +} + +/// Load multiple 3-element structures to three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld3, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld3q_lane_f32(a: *const f32, b: float32x4x3_t) -> float32x4x3_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld3lane.v4f32.p0i8")] + fn vld3q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *const i8) -> float32x4x3_t; + } +vld3q_lane_f32_(b.0, b.1, b.2, LANE as i64, a.cast()) +} + /// Load multiple 4-element structures to four registers #[inline] #[cfg(target_arch = "arm")] @@ -9154,160 +9474,480 @@ pub unsafe fn vld4_dup_u16(a: *const u16) -> uint16x4x4_t { transmute(vld4_dup_s16(transmute(a))) } -/// Load single 4-element structure and replicate to all lanes of four registers +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t { + transmute(vld4_dup_s32(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t { + transmute(vld4q_dup_s8(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch 
= "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t { + transmute(vld4q_dup_s16(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t { + transmute(vld4q_dup_s32(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t { + transmute(vld4_dup_s8(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t { + transmute(vld4_dup_s16(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t { + transmute(vld4q_dup_s8(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t { + transmute(vld4q_dup_s16(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_u64(a: *const u64) -> uint64x1x4_t { + transmute(vld4_dup_s64(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_p64(a: *const p64) -> poly64x1x4_t { + transmute(vld4_dup_s64(transmute(a))) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = 
"arm", link_name = "llvm.arm.neon.vld4dup.v2f32.p0i8")] + fn vld4_dup_f32_(ptr: *const i8, size: i32) -> float32x2x4_t; + } +vld4_dup_f32_(a as *const i8, 4) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f32.p0f32")] + fn vld4_dup_f32_(ptr: *const f32) -> float32x2x4_t; + } +vld4_dup_f32_(a.cast()) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))] +pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4f32.p0i8")] + fn vld4q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x4_t; + } +vld4q_dup_f32_(a as *const i8, 4) +} + +/// Load single 4-element structure and replicate to all lanes of four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))] +pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4f32.p0f32")] + fn vld4q_dup_f32_(ptr: *const f32) -> float32x4x4_t; + } +vld4q_dup_f32_(a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s8(a: *const i8, b: int8x8x4_t) -> int8x8x4_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8i8.p0i8")] + fn vld4_lane_s8_(ptr: *const i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i32, size: i32) -> int8x8x4_t; + } +vld4_lane_s8_(a.cast(), b.0, b.1, b.2, b.3, LANE, 1) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s8(a: *const i8, b: int8x8x4_t) -> int8x8x4_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v8i8.p0i8")] + fn vld4_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i64, ptr: *const i8) -> int8x8x4_t; + } +vld4_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s16(a: *const i16, b: int16x4x4_t) -> int16x4x4_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = 
"llvm.arm.neon.vld4lane.v4i16.p0i8")] + fn vld4_lane_s16_(ptr: *const i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i32, size: i32) -> int16x4x4_t; + } +vld4_lane_s16_(a.cast(), b.0, b.1, b.2, b.3, LANE, 2) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s16(a: *const i16, b: int16x4x4_t) -> int16x4x4_t { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v4i16.p0i8")] + fn vld4_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i64, ptr: *const i8) -> int16x4x4_t; + } +vld4_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s32(a: *const i32, b: int32x2x4_t) -> int32x2x4_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v2i32.p0i8")] + fn vld4_lane_s32_(ptr: *const i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i32, size: i32) -> int32x2x4_t; + } +vld4_lane_s32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4_lane_s32(a: *const i32, b: int32x2x4_t) -> int32x2x4_t { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2i32.p0i8")] + fn vld4_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i64, ptr: *const i8) -> int32x2x4_t; + } +vld4_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_s16(a: *const i16, b: int16x8x4_t) -> int16x8x4_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v8i16.p0i8")] + fn vld4q_lane_s16_(ptr: *const i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i32, size: i32) -> int16x8x4_t; + } +vld4q_lane_s16_(a.cast(), b.0, b.1, b.2, b.3, LANE, 2) +} + +/// Load multiple 4-element structures to four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vld4q_lane_s16(a: *const i16, b: int16x8x4_t) -> int16x8x4_t { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v8i16.p0i8")] + fn vld4q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i64, ptr: 
*const i8) -> int16x8x4_t;
+ }
+vld4q_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Load multiple 4-element structures to four registers
 #[inline]
-#[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4_dup_u32(a: *const u32) -> uint32x2x4_t {
- transmute(vld4_dup_s32(transmute(a)))
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x4_t) -> int32x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4i32.p0i8")]
+ fn vld4q_lane_s32_(ptr: *const i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i32, size: i32) -> int32x4x4_t;
+ }
+vld4q_lane_s32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
+#[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4q_dup_u8(a: *const u8) -> uint8x16x4_t {
- transmute(vld4q_dup_s8(transmute(a)))
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_s32<const LANE: i32>(a: *const i32, b: int32x4x4_t) -> int32x4x4_t {
+ static_assert_imm2!(LANE);
+ #[allow(improper_ctypes)]
+ extern "unadjusted" {
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v4i32.p0i8")]
+ fn vld4q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i64, ptr: *const i8) -> int32x4x4_t;
+ }
+vld4q_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4q_dup_u16(a: *const u16) -> uint16x8x4_t {
- transmute(vld4q_dup_s16(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_u8<const LANE: i32>(a: *const u8, b: uint8x8x4_t) -> uint8x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4_lane_s8::<LANE>(transmute(a), transmute(b)))
 }
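+
+// A note on the delegating form above, as a minimal sketch rather than
+// generated text: the unsigned (and, below, polynomial) lane loads forward to
+// the signed intrinsic of the same shape and transmute the result, which is
+// sound because the input and output register tuples are layout-identical;
+// only the element interpretation differs. Hypothetical call for illustration:
+// let loaded = vld4_lane_u8::<3>(src_ptr, acc); // src_ptr and acc are placeholders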
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4q_dup_u32(a: *const u32) -> uint32x4x4_t {
- transmute(vld4q_dup_s32(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_u16<const LANE: i32>(a: *const u16, b: uint16x4x4_t) -> uint16x4x4_t {
+ static_assert_imm2!(LANE);
+ transmute(vld4_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4_dup_p8(a: *const p8) -> poly8x8x4_t {
- transmute(vld4_dup_s8(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_u32<const LANE: i32>(a: *const u32, b: uint32x2x4_t) -> uint32x2x4_t {
+ static_assert_imm1!(LANE);
+ transmute(vld4_lane_s32::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4_dup_p16(a: *const p16) -> poly16x4x4_t {
- transmute(vld4_dup_s16(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_u16<const LANE: i32>(a: *const u16, b: uint16x8x4_t) -> uint16x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4q_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4q_dup_p8(a: *const p8) -> poly8x16x4_t {
- transmute(vld4q_dup_s8(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_u32<const LANE: i32>(a: *const u32, b: uint32x4x4_t) -> uint32x4x4_t {
+ static_assert_imm2!(LANE);
+ transmute(vld4q_lane_s32::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4q_dup_p16(a: *const p16) -> poly16x8x4_t {
- transmute(vld4q_dup_s16(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_p8<const LANE: i32>(a: *const p8, b:
poly8x8x4_t) -> poly8x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4_lane_s8::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4_dup_u64(a: *const u64) -> uint64x1x4_t {
- transmute(vld4_dup_s64(transmute(a)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_p16<const LANE: i32>(a: *const p16, b: poly16x4x4_t) -> poly16x4x4_t {
+ static_assert_imm2!(LANE);
+ transmute(vld4_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
-#[target_feature(enable = "neon,aes")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4_dup_p64(a: *const p64) -> poly64x1x4_t {
- transmute(vld4_dup_s64(transmute(a)))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_p16<const LANE: i32>(a: *const p16, b: poly16x8x4_t) -> poly16x8x4_t {
+ static_assert_imm3!(LANE);
+ transmute(vld4q_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x4_t) -> float32x2x4_t {
+ static_assert_imm1!(LANE);
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v2f32.p0i8")]
- fn vld4_dup_f32_(ptr: *const i8, size: i32) -> float32x2x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v2f32.p0i8")]
+ fn vld4_lane_f32_(ptr: *const i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i32, size: i32) -> float32x2x4_t;
 }
-vld4_dup_f32_(a as *const i8, 4)
+vld4_lane_f32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
 }
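+
+// The ARM definition above and the AArch64 definition below share one public
+// signature but bind different LLVM intrinsics: vld4lane takes the pointer
+// first with an i32 lane index and a trailing element size, while ld4lane
+// takes the four vectors first, an i64 lane index, and the pointer last. In
+// both, LANE is checked at compile time; illustratively, a hypothetical
+// vld4_lane_f32::<2>(p, v) is rejected because static_assert_imm1! only
+// admits lanes 0 and 1.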
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4_dup_f32(a: *const f32) -> float32x2x4_t {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4_lane_f32<const LANE: i32>(a: *const f32, b: float32x2x4_t) -> float32x2x4_t {
+ static_assert_imm1!(LANE);
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v2f32.p0f32")]
- fn vld4_dup_f32_(ptr: *const f32) -> float32x2x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v2f32.p0i8")]
+ fn vld4_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i64, ptr: *const i8) -> float32x2x4_t;
 }
-vld4_dup_f32_(a.cast())
+vld4_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4))]
-pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x4_t) -> float32x4x4_t {
+ static_assert_imm2!(LANE);
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4dup.v4f32.p0i8")]
- fn vld4q_dup_f32_(ptr: *const i8, size: i32) -> float32x4x4_t;
+ #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vld4lane.v4f32.p0i8")]
+ fn vld4q_lane_f32_(ptr: *const i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i32, size: i32) -> float32x4x4_t;
 }
-vld4q_dup_f32_(a as *const i8, 4)
+vld4q_lane_f32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
 }
-/// Load single 4-element structure and replicate to all lanes of four registers
+/// Load multiple 4-element structures to four registers
 #[inline]
 #[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4r))]
-pub unsafe fn vld4q_dup_f32(a: *const f32) -> float32x4x4_t {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(ld4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vld4q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x4_t) -> float32x4x4_t {
+ static_assert_imm2!(LANE);
 #[allow(improper_ctypes)]
 extern "unadjusted" {
- #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4r.v4f32.p0f32")]
- fn vld4q_dup_f32_(ptr: *const f32) -> float32x4x4_t;
+ #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.ld4lane.v4f32.p0i8")]
+ fn vld4q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i64, ptr: *const i8) -> float32x4x4_t;
 }
-vld4q_dup_f32_(a.cast())
+vld4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
 /// Store multiple single-element structures from one, two, three, or four registers
 #[inline]
 #[cfg(target_arch = "arm")]
@@ -25056,6 +25696,141 @@ mod test {
 assert_eq!(r, e);
 }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_s8() {
+ let a: [i8; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8];
+ let b: [i8x8; 3] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 17, 18)];
+ let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 17, 18)];
+ let r: [i8x8; 3] = transmute(vld3_lane_s8::<0>(a[1..].as_ptr(), transmute(b)));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vld3_lane_s16() {
+ let a: [i16; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4];
+ let b: [i16x4; 3] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17,
18), i16x4::new(2, 20, 21, 22)]; + let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)]; + let r: [i16x4; 3] = transmute(vld3_lane_s16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_s32() { + let a: [i32; 7] = [0, 1, 2, 2, 4, 5, 6]; + let b: [i32x2; 3] = [i32x2::new(0, 2), i32x2::new(2, 14), i32x2::new(2, 16)]; + let e: [i32x2; 3] = [i32x2::new(1, 2), i32x2::new(2, 14), i32x2::new(2, 16)]; + let r: [i32x2; 3] = transmute(vld3_lane_s32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_s16() { + let a: [i16; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x8; 3] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 17, 18)]; + let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 17, 18)]; + let r: [i16x8; 3] = transmute(vld3q_lane_s16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_s32() { + let a: [i32; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4]; + let b: [i32x4; 3] = [i32x4::new(0, 2, 2, 14), i32x4::new(2, 16, 17, 18), i32x4::new(2, 20, 21, 22)]; + let e: [i32x4; 3] = [i32x4::new(1, 2, 2, 14), i32x4::new(2, 16, 17, 18), i32x4::new(2, 20, 21, 22)]; + let r: [i32x4; 3] = transmute(vld3q_lane_s32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_u8() { + let a: [u8; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u8x8; 3] = [u8x8::new(0, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26), u8x8::new(11, 12, 13, 14, 15, 16, 17, 18)]; + let e: [u8x8; 3] = [u8x8::new(1, 2, 2, 14, 2, 16, 17, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26), u8x8::new(2, 12, 13, 14, 15, 16, 17, 18)]; + let r: [u8x8; 3] = transmute(vld3_lane_u8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_u16() { + let a: [u16; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4]; + let b: [u16x4; 3] = [u16x4::new(0, 2, 2, 14), u16x4::new(2, 16, 17, 18), u16x4::new(2, 20, 21, 22)]; + let e: [u16x4; 3] = [u16x4::new(1, 2, 2, 14), u16x4::new(2, 16, 17, 18), u16x4::new(2, 20, 21, 22)]; + let r: [u16x4; 3] = transmute(vld3_lane_u16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_u32() { + let a: [u32; 7] = [0, 1, 2, 2, 4, 5, 6]; + let b: [u32x2; 3] = [u32x2::new(0, 2), u32x2::new(2, 14), u32x2::new(2, 16)]; + let e: [u32x2; 3] = [u32x2::new(1, 2), u32x2::new(2, 14), u32x2::new(2, 16)]; + let r: [u32x2; 3] = transmute(vld3_lane_u32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_u16() { + let a: [u16; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u16x8; 3] = [u16x8::new(0, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26), u16x8::new(11, 12, 13, 14, 15, 16, 17, 18)]; + let e: [u16x8; 3] = [u16x8::new(1, 2, 2, 14, 2, 16, 17, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26), u16x8::new(2, 12, 13, 14, 15, 16, 17, 18)]; + let r: [u16x8; 3] = 
transmute(vld3q_lane_u16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_u32() { + let a: [u32; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4]; + let b: [u32x4; 3] = [u32x4::new(0, 2, 2, 14), u32x4::new(2, 16, 17, 18), u32x4::new(2, 20, 21, 22)]; + let e: [u32x4; 3] = [u32x4::new(1, 2, 2, 14), u32x4::new(2, 16, 17, 18), u32x4::new(2, 20, 21, 22)]; + let r: [u32x4; 3] = transmute(vld3q_lane_u32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_p8() { + let a: [u8; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x8; 3] = [i8x8::new(0, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 17, 18)]; + let e: [i8x8; 3] = [i8x8::new(1, 2, 2, 14, 2, 16, 17, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 17, 18)]; + let r: [i8x8; 3] = transmute(vld3_lane_p8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_p16() { + let a: [u16; 13] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4]; + let b: [i16x4; 3] = [i16x4::new(0, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)]; + let e: [i16x4; 3] = [i16x4::new(1, 2, 2, 14), i16x4::new(2, 16, 17, 18), i16x4::new(2, 20, 21, 22)]; + let r: [i16x4; 3] = transmute(vld3_lane_p16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_p16() { + let a: [u16; 25] = [0, 1, 2, 2, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x8; 3] = [i16x8::new(0, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 17, 18)]; + let e: [i16x8; 3] = [i16x8::new(1, 2, 2, 14, 2, 16, 17, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 17, 18)]; + let r: [i16x8; 3] = transmute(vld3q_lane_p16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3_lane_f32() { + let a: [f32; 7] = [0., 1., 2., 2., 4., 5., 6.]; + let b: [f32x2; 3] = [f32x2::new(0., 2.), f32x2::new(2., 14.), f32x2::new(9., 16.)]; + let e: [f32x2; 3] = [f32x2::new(1., 2.), f32x2::new(2., 14.), f32x2::new(2., 16.)]; + let r: [f32x2; 3] = transmute(vld3_lane_f32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld3q_lane_f32() { + let a: [f32; 13] = [0., 1., 2., 2., 4., 5., 6., 7., 8., 5., 6., 7., 8.]; + let b: [f32x4; 3] = [f32x4::new(0., 2., 2., 14.), f32x4::new(9., 16., 17., 18.), f32x4::new(5., 6., 7., 8.)]; + let e: [f32x4; 3] = [f32x4::new(1., 2., 2., 14.), f32x4::new(2., 16., 17., 18.), f32x4::new(2., 6., 7., 8.)]; + let r: [f32x4; 3] = transmute(vld3q_lane_f32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vld4_s8() { let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; @@ -25392,6 +26167,141 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_s8() { + let a: [i8; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x8; 4] = [i8x8::new(0, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(11, 12, 13, 14, 
15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i8x8; 4] = transmute(vld4_lane_s8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_s16() { + let a: [i16; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x4; 4] = [i16x4::new(0, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)]; + let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)]; + let r: [i16x4; 4] = transmute(vld4_lane_s16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_s32() { + let a: [i32; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8]; + let b: [i32x2; 4] = [i32x2::new(0, 2), i32x2::new(2, 2), i32x2::new(2, 16), i32x2::new(2, 18)]; + let e: [i32x2; 4] = [i32x2::new(1, 2), i32x2::new(2, 2), i32x2::new(2, 16), i32x2::new(2, 18)]; + let r: [i32x2; 4] = transmute(vld4_lane_s32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_s16() { + let a: [i16; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x8; 4] = [i16x8::new(0, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i16x8; 4] = transmute(vld4q_lane_s16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_s32() { + let a: [i32; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i32x4; 4] = [i32x4::new(0, 2, 2, 2), i32x4::new(2, 16, 2, 18), i32x4::new(2, 20, 21, 22), i32x4::new(2, 24, 25, 26)]; + let e: [i32x4; 4] = [i32x4::new(1, 2, 2, 2), i32x4::new(2, 16, 2, 18), i32x4::new(2, 20, 21, 22), i32x4::new(2, 24, 25, 26)]; + let r: [i32x4; 4] = transmute(vld4q_lane_s32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_u8() { + let a: [u8; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u8x8; 4] = [u8x8::new(0, 2, 2, 2, 2, 16, 2, 18), u8x8::new(2, 20, 21, 22, 2, 24, 25, 26), u8x8::new(11, 12, 13, 14, 15, 16, 2, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [u8x8; 4] = [u8x8::new(1, 2, 2, 2, 2, 16, 2, 18), u8x8::new(2, 20, 21, 22, 2, 24, 25, 26), u8x8::new(2, 12, 13, 14, 15, 16, 2, 18), u8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [u8x8; 4] = transmute(vld4_lane_u8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_u16() { + let a: [u16; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u16x4; 4] = [u16x4::new(0, 2, 2, 2), u16x4::new(2, 16, 2, 18), u16x4::new(2, 20, 21, 22), u16x4::new(2, 24, 25, 26)]; + let e: [u16x4; 4] = [u16x4::new(1, 2, 2, 2), u16x4::new(2, 16, 2, 18), u16x4::new(2, 20, 21, 22), u16x4::new(2, 24, 25, 26)]; + let r: [u16x4; 4] = 
transmute(vld4_lane_u16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_u32() { + let a: [u32; 9] = [0, 1, 2, 2, 2, 5, 6, 7, 8]; + let b: [u32x2; 4] = [u32x2::new(0, 2), u32x2::new(2, 2), u32x2::new(2, 16), u32x2::new(2, 18)]; + let e: [u32x2; 4] = [u32x2::new(1, 2), u32x2::new(2, 2), u32x2::new(2, 16), u32x2::new(2, 18)]; + let r: [u32x2; 4] = transmute(vld4_lane_u32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_u16() { + let a: [u16; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u16x8; 4] = [u16x8::new(0, 2, 2, 2, 2, 16, 2, 18), u16x8::new(2, 20, 21, 22, 2, 24, 25, 26), u16x8::new(11, 12, 13, 14, 15, 16, 2, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [u16x8; 4] = [u16x8::new(1, 2, 2, 2, 2, 16, 2, 18), u16x8::new(2, 20, 21, 22, 2, 24, 25, 26), u16x8::new(2, 12, 13, 14, 15, 16, 2, 18), u16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [u16x8; 4] = transmute(vld4q_lane_u16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_u32() { + let a: [u32; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [u32x4; 4] = [u32x4::new(0, 2, 2, 2), u32x4::new(2, 16, 2, 18), u32x4::new(2, 20, 21, 22), u32x4::new(2, 24, 25, 26)]; + let e: [u32x4; 4] = [u32x4::new(1, 2, 2, 2), u32x4::new(2, 16, 2, 18), u32x4::new(2, 20, 21, 22), u32x4::new(2, 24, 25, 26)]; + let r: [u32x4; 4] = transmute(vld4q_lane_u32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_p8() { + let a: [u8; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i8x8; 4] = [i8x8::new(0, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(11, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i8x8; 4] = [i8x8::new(1, 2, 2, 2, 2, 16, 2, 18), i8x8::new(2, 20, 21, 22, 2, 24, 25, 26), i8x8::new(2, 12, 13, 14, 15, 16, 2, 18), i8x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i8x8; 4] = transmute(vld4_lane_p8::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_p16() { + let a: [u16; 17] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x4; 4] = [i16x4::new(0, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)]; + let e: [i16x4; 4] = [i16x4::new(1, 2, 2, 2), i16x4::new(2, 16, 2, 18), i16x4::new(2, 20, 21, 22), i16x4::new(2, 24, 25, 26)]; + let r: [i16x4; 4] = transmute(vld4_lane_p16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_p16() { + let a: [u16; 33] = [0, 1, 2, 2, 2, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8]; + let b: [i16x8; 4] = [i16x8::new(0, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(11, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let e: [i16x8; 4] = [i16x8::new(1, 2, 2, 2, 2, 16, 2, 18), i16x8::new(2, 20, 21, 22, 2, 24, 25, 26), i16x8::new(2, 12, 13, 14, 15, 16, 2, 18), i16x8::new(2, 20, 21, 22, 23, 24, 25, 26)]; + let r: [i16x8; 4] = transmute(vld4q_lane_p16::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + 
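+ // Reading these lane tests, as a rough sketch of the semantics: the pointer
+ // feeds four consecutive elements into lane LANE of the four input vectors,
+ // one element per vector, and every other lane passes through unchanged.
+ // With a zeroed input tuple (names hypothetical):
+ //     let out = vld4_lane_f32::<0>([1., 2., 3., 4.].as_ptr(), zeroed);
+ //     // out.0[0] == 1., out.1[0] == 2., out.2[0] == 3., out.3[0] == 4.
+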
#[simd_test(enable = "neon")] + unsafe fn test_vld4_lane_f32() { + let a: [f32; 9] = [0., 1., 2., 2., 2., 5., 6., 7., 8.]; + let b: [f32x2; 4] = [f32x2::new(0., 2.), f32x2::new(2., 2.), f32x2::new(2., 16.), f32x2::new(2., 18.)]; + let e: [f32x2; 4] = [f32x2::new(1., 2.), f32x2::new(2., 2.), f32x2::new(2., 16.), f32x2::new(2., 18.)]; + let r: [f32x2; 4] = transmute(vld4_lane_f32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vld4q_lane_f32() { + let a: [f32; 17] = [0., 1., 2., 2., 2., 5., 6., 7., 8., 5., 6., 7., 8., 1., 4., 3., 5.]; + let b: [f32x4; 4] = [f32x4::new(0., 2., 2., 2.), f32x4::new(2., 16., 2., 18.), f32x4::new(5., 6., 7., 8.), f32x4::new(1., 4., 3., 5.)]; + let e: [f32x4; 4] = [f32x4::new(1., 2., 2., 2.), f32x4::new(2., 16., 2., 18.), f32x4::new(2., 6., 7., 8.), f32x4::new(2., 4., 3., 5.)]; + let r: [f32x4; 4] = transmute(vld4q_lane_f32::<0>(a[1..].as_ptr(), transmute(b))); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_s8_x2() { let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 1b0250876f..554afbeaa0 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2442,16 +2442,16 @@ validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 1 load_fn arm-aarch64-separate -aarch64 = ld3lane +aarch64 = ld3 const-aarch64 = LANE link-aarch64 = ld3lane._EXTpi82_ -//generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t +generate *const i8:int8x16x3_t:int8x16x3_t, *const i64:int64x1x3_t:int64x1x3_t, *const i64:int64x2x3_t:int64x2x3_t -arm = vld3lane +arm = vld3 const-arm = LANE link-arm = vld3lane._EXTpi82_ -//generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t -//generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t +generate *const i8:int8x8x3_t:int8x8x3_t, *const i16:int16x4x3_t:int16x4x3_t, *const i32:int32x2x3_t:int32x2x3_t +generate *const i16:int16x8x3_t:int16x8x3_t, *const i32:int32x4x3_t:int32x4x3_t /// Load multiple 3-element structures to three registers name = vld3 @@ -2465,19 +2465,19 @@ n = 0 validate 1, 2, 2, 14, 2, 16, 17, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8 load_fn -aarch64 = ld3lane +aarch64 = ld3 const-aarch64 = LANE target = aes -//generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t +generate *const p64:poly64x1x3_t:poly64x1x3_t, *const p64:poly64x2x3_t:poly64x2x3_t target = default -//generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t +generate *const p8:poly8x16x3_t:poly8x16x3_t, *const u8:uint8x16x3_t:uint8x16x3_t, *const u64:uint64x1x3_t:uint64x1x3_t, *const u64:uint64x2x3_t:uint64x2x3_t -arm = vld3lane +arm = vld3 const-arm = LANE -//generate *const u8:uint8x8x3_t:uint8x8x3_t, *const u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t -//generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t -//generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t -//generate *const p16:poly16x8x3_t:poly16x8x3_t +generate *const u8:uint8x8x3_t:uint8x8x3_t, *const 
u16:uint16x4x3_t:uint16x4x3_t, *const u32:uint32x2x3_t:uint32x2x3_t +generate *const u16:uint16x8x3_t:uint16x8x3_t, *const u32:uint32x4x3_t:uint32x4x3_t +generate *const p8:poly8x8x3_t:poly8x8x3_t, *const p16:poly16x4x3_t:poly16x4x3_t +generate *const p16:poly16x8x3_t:poly16x8x3_t /// Load multiple 3-element structures to three registers name = vld3 @@ -2491,15 +2491,15 @@ validate 1., 2., 2., 14., 2., 16., 17., 18., 2., 6., 7., 8. load_fn arm-aarch64-separate -aarch64 = ld3lane +aarch64 = ld3 const-aarch64 = LANE link-aarch64 = ld3lane._EXTpi82_ -//generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t +generate *const f64:float64x1x3_t:float64x1x3_t, *const f64:float64x2x3_t:float64x2x3_t -arm = vld3lane +arm = vld3 const-arm = LANE link-arm = vld3lane._EXTpi82_ -//generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t +generate *const f32:float32x2x3_t:float32x2x3_t, *const f32:float32x4x3_t:float32x4x3_t /// Load multiple 4-element structures to four registers name = vld4 @@ -2633,16 +2633,16 @@ validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, load_fn arm-aarch64-separate -aarch64 = ld4lane +aarch64 = ld4 const-aarch64 = LANE link-aarch64 = ld4lane._EXTpi82_ -//generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t +generate *const i8:int8x16x4_t:int8x16x4_t, *const i64:int64x1x4_t:int64x1x4_t, *const i64:int64x2x4_t:int64x2x4_t -arm = vld4lane +arm = vld4 const-arm = LANE link-arm = vld4lane._EXTpi82_ -//generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t -//generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t +generate *const i8:int8x8x4_t:int8x8x4_t, *const i16:int16x4x4_t:int16x4x4_t, *const i32:int32x2x4_t:int32x2x4_t +generate *const i16:int16x8x4_t:int16x8x4_t, *const i32:int32x4x4_t:int32x4x4_t /// Load multiple 4-element structures to four registers name = vld4 @@ -2656,19 +2656,19 @@ n = 0 validate 1, 2, 2, 2, 2, 16, 2, 18, 2, 20, 21, 22, 2, 24, 25, 26, 2, 12, 13, 14, 15, 16, 2, 18, 2, 20, 21, 22, 23, 24, 25, 26, 2, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 2, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16 load_fn -aarch64 = ld4lane +aarch64 = ld4 const-aarch64 = LANE target = aes -//generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t +generate *const p64:poly64x1x4_t:poly64x1x4_t, *const p64:poly64x2x4_t:poly64x2x4_t target = default -//generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t +generate *const p8:poly8x16x4_t:poly8x16x4_t, *const u8:uint8x16x4_t:uint8x16x4_t, *const u64:uint64x1x4_t:uint64x1x4_t, *const u64:uint64x2x4_t:uint64x2x4_t -arm = vld4lane +arm = vld4 const-arm = LANE -//generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t -//generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t -//generate *const p8:poly8x8x4_t:poly8x8x4_t, *const p16:poly16x4x4_t:poly16x4x4_t -//generate *const p16:poly16x8x4_t:poly16x8x4_t +generate *const u8:uint8x8x4_t:uint8x8x4_t, *const u16:uint16x4x4_t:uint16x4x4_t, *const u32:uint32x2x4_t:uint32x2x4_t +generate *const u16:uint16x8x4_t:uint16x8x4_t, *const u32:uint32x4x4_t:uint32x4x4_t +generate *const p8:poly8x8x4_t:poly8x8x4_t, *const 
p16:poly16x4x4_t:poly16x4x4_t +generate *const p16:poly16x8x4_t:poly16x8x4_t /// Load multiple 4-element structures to four registers name = vld4 @@ -2682,15 +2682,15 @@ validate 1., 2., 2., 2., 2., 16., 2., 18., 2., 6., 7., 8., 2., 4., 3., 5. load_fn arm-aarch64-separate -aarch64 = ld4lane +aarch64 = ld4 const-aarch64 = LANE link-aarch64 = ld4lane._EXTpi82_ -//generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t +generate *const f64:float64x1x4_t:float64x1x4_t, *const f64:float64x2x4_t:float64x2x4_t -arm = vld4lane +arm = vld4 const-arm = LANE link-arm = vld4lane._EXTpi82_ -//generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t +generate *const f32:float32x2x4_t:float32x2x4_t, *const f32:float32x4x4_t:float32x4x4_t /// Store multiple single-element structures from one, two, three, or four registers name = vst1 From d2b7d22e92b4e2973b54693b47feeb53117e6bd3 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 16:34:29 +0800 Subject: [PATCH 18/28] change instr limit --- crates/stdarch-test/src/lib.rs | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index a62bddbad4..cf4b0e28b8 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -131,6 +131,12 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, // core_arch/src/arm_shared/simd32 // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) + "vld3" => 41, + // core_arch/src/arm_shared/simd32 + // vld4q_lane_u32_vld4 : #instructions = 32 >= 22 (limit) + "vld4" => 23, + // core_arch/src/arm_shared/simd32 + // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) "vst1" => 41, // Temporary, currently the fptosi.sat and fptoui.sat LLVM From 1dbf5a23d69f9f5cb1ca53855513e2d1f2136731 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 16:42:57 +0800 Subject: [PATCH 19/28] change instr limit --- crates/stdarch-test/src/lib.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index cf4b0e28b8..c66a7c93d2 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -131,10 +131,10 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, // core_arch/src/arm_shared/simd32 // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) - "vld3" => 41, + "vld3" => 23, // core_arch/src/arm_shared/simd32 // vld4q_lane_u32_vld4 : #instructions = 32 >= 22 (limit) - "vld4" => 23, + "vld4" => 32, // core_arch/src/arm_shared/simd32 // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) "vst1" => 41, From ceb184385a05b161f9e39eb1cb9ddf71b0d29327 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 16:58:07 +0800 Subject: [PATCH 20/28] add vst1_lane neon instr --- .../core_arch/src/aarch64/neon/generated.rs | 38 ++ .../src/arm_shared/neon/generated.rs | 504 ++++++++++++++++++ crates/stdarch-gen/neon.spec | 16 +- crates/stdarch-test/src/lib.rs | 4 +- 4 files changed, 552 insertions(+), 10 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 477d614b30..cefde1bf26 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -5327,6 +5327,26 @@ pub 
unsafe fn vld4q_lane_f64<const LANE: i32>(a: *const f64, b: float64x2x4_t) -
 vld4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
 /// Store multiple single-element structures to one, two, three, or four registers
 #[inline]
 #[target_feature(enable = "neon")]
@@ -14333,6 +14353,24 @@ mod test {
 assert_eq!(r, e);
 }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1_lane_f64() {
+ let a: [f64; 2] = [0., 1.];
+ let e: [f64; 1] = [1.];
+ let mut r: [f64; 1] = [0f64; 1];
+ vst1_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+ assert_eq!(r, e);
+ }
+
+ #[simd_test(enable = "neon")]
+ unsafe fn test_vst1q_lane_f64() {
+ let a: [f64; 3] = [0., 1., 2.];
+ let e: [f64; 2] = [1., 0.];
+ let mut r: [f64; 2] = [0f64; 2];
+ vst1q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+ assert_eq!(r, e);
+ }
+
 #[simd_test(enable = "neon")]
 unsafe fn test_vst1_f64_x2() {
 let a: [f64; 3] = [0., 1., 2.];
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 844e4bec5e..a8e00138c3 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -9950,6 +9950,294 @@ pub unsafe fn vld4q_lane_f32<const LANE: i32>(a: *const f32, b: float32x4x4_t) -
 vld4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
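+// These vst1_lane stores reduce to a single scalar write: simd_extract pulls
+// lane LANE out of the vector and `*a = ...` stores it, which is why the
+// assert_instr expectation above is `nop` rather than a store opcode. Rough
+// usage sketch (v is a placeholder vector):
+// let mut out = 0i16;
+// vst1_lane_s16::<2>(&mut out, v); // out now holds lane 2 of v
+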
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16_t) {
+ static_assert_imm4!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1_t) {
+ static_assert!(LANE : i32 where LANE == 0);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16_t) {
+ static_assert_imm4!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8_t) {
+ static_assert_imm3!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4_t) {
+ static_assert_imm2!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst1q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2_t) {
+ static_assert_imm1!(LANE);
+ *a = simd_extract(b, LANE as u32);
+}
+
+/// Store multiple single-element structures from one, two, three, or four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))]
+#[cfg_attr(all(test, target_arch =
"aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1_lane_p8(a: *mut p8, b: poly8x8_t) { + static_assert_imm3!(LANE); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1_lane_p16(a: *mut p16, b: poly16x4_t) { + static_assert_imm2!(LANE); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1q_lane_p8(a: *mut p8, b: poly8x16_t) { + static_assert_imm4!(LANE); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1q_lane_p16(a: *mut p16, b: poly16x8_t) { + static_assert_imm3!(LANE); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1_lane_p64(a: *mut p64, b: poly64x1_t) { + static_assert!(LANE : i32 where LANE == 0); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1q_lane_p64(a: *mut p64, b: poly64x2_t) { + static_assert_imm1!(LANE); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1_lane_f32(a: *mut f32, b: float32x2_t) { + static_assert_imm1!(LANE); + *a = simd_extract(b, LANE as u32); +} + +/// Store multiple single-element structures from one, two, three, or four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, 
target_arch = "arm"), assert_instr(nop, LANE = 0))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst1q_lane_f32(a: *mut f32, b: float32x4_t) { + static_assert_imm2!(LANE); + *a = simd_extract(b, LANE as u32); +} + /// Store multiple single-element structures from one, two, three, or four registers #[inline] #[cfg(target_arch = "arm")] @@ -26302,6 +26590,222 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_s8() { + let a: [i8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [i8; 8] = [1, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [i8; 8] = [0i8; 8]; + vst1_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_s16() { + let a: [i16; 5] = [0, 1, 2, 3, 4]; + let e: [i16; 4] = [1, 0, 0, 0]; + let mut r: [i16; 4] = [0i16; 4]; + vst1_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_s32() { + let a: [i32; 3] = [0, 1, 2]; + let e: [i32; 2] = [1, 0]; + let mut r: [i32; 2] = [0i32; 2]; + vst1_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_s64() { + let a: [i64; 2] = [0, 1]; + let e: [i64; 1] = [1]; + let mut r: [i64; 1] = [0i64; 1]; + vst1_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_s8() { + let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [i8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [i8; 16] = [0i8; 16]; + vst1q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_s16() { + let a: [i16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [i16; 8] = [1, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [i16; 8] = [0i16; 8]; + vst1q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_s32() { + let a: [i32; 5] = [0, 1, 2, 3, 4]; + let e: [i32; 4] = [1, 0, 0, 0]; + let mut r: [i32; 4] = [0i32; 4]; + vst1q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_s64() { + let a: [i64; 3] = [0, 1, 2]; + let e: [i64; 2] = [1, 0]; + let mut r: [i64; 2] = [0i64; 2]; + vst1q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_u8() { + let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u8; 8] = [1, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [u8; 8] = [0u8; 8]; + vst1_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_u16() { + let a: [u16; 5] = [0, 1, 2, 3, 4]; + let e: [u16; 4] = [1, 0, 0, 0]; + let mut r: [u16; 4] = [0u16; 4]; + vst1_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_u32() { + let a: [u32; 3] = [0, 1, 
2]; + let e: [u32; 2] = [1, 0]; + let mut r: [u32; 2] = [0u32; 2]; + vst1_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_u64() { + let a: [u64; 2] = [0, 1]; + let e: [u64; 1] = [1]; + let mut r: [u64; 1] = [0u64; 1]; + vst1_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_u8() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [u8; 16] = [0u8; 16]; + vst1q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_u16() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u16; 8] = [1, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [u16; 8] = [0u16; 8]; + vst1q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_u32() { + let a: [u32; 5] = [0, 1, 2, 3, 4]; + let e: [u32; 4] = [1, 0, 0, 0]; + let mut r: [u32; 4] = [0u32; 4]; + vst1q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_u64() { + let a: [u64; 3] = [0, 1, 2]; + let e: [u64; 2] = [1, 0]; + let mut r: [u64; 2] = [0u64; 2]; + vst1q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_p8() { + let a: [u8; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u8; 8] = [1, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [u8; 8] = [0u8; 8]; + vst1_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_p16() { + let a: [u16; 5] = [0, 1, 2, 3, 4]; + let e: [u16; 4] = [1, 0, 0, 0]; + let mut r: [u16; 4] = [0u16; 4]; + vst1_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_p8() { + let a: [u8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; + let e: [u8; 16] = [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [u8; 16] = [0u8; 16]; + vst1q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_p16() { + let a: [u16; 9] = [0, 1, 2, 3, 4, 5, 6, 7, 8]; + let e: [u16; 8] = [1, 0, 0, 0, 0, 0, 0, 0]; + let mut r: [u16; 8] = [0u16; 8]; + vst1q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1_lane_p64() { + let a: [u64; 2] = [0, 1]; + let e: [u64; 1] = [1]; + let mut r: [u64; 1] = [0u64; 1]; + vst1_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_p64() { + let a: [u64; 3] = [0, 1, 2]; + let e: [u64; 2] = [1, 0]; + let mut r: [u64; 2] = [0u64; 2]; + vst1q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = 
"neon")] + unsafe fn test_vst1_lane_f32() { + let a: [f32; 3] = [0., 1., 2.]; + let e: [f32; 2] = [1., 0.]; + let mut r: [f32; 2] = [0f32; 2]; + vst1_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst1q_lane_f32() { + let a: [f32; 5] = [0., 1., 2., 3., 4.]; + let e: [f32; 4] = [1., 0., 0., 0.]; + let mut r: [f32; 4] = [0f32; 4]; + vst1q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst1_s8_x2() { let a: [i8; 17] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 554afbeaa0..6a5bc7a08a 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2705,13 +2705,13 @@ store_fn aarch64 = nop arm = nop -//generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void -//generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void -//generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void -//generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void -//generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void +generate *mut i8:int8x8_t:void, *mut i16:int16x4_t:void, *mut i32:int32x2_t:void, *mut i64:int64x1_t:void +generate *mut i8:int8x16_t:void, *mut i16:int16x8_t:void, *mut i32:int32x4_t:void, *mut i64:int64x2_t:void +generate *mut u8:uint8x8_t:void, *mut u16:uint16x4_t:void, *mut u32:uint32x2_t:void, *mut u64:uint64x1_t:void +generate *mut u8:uint8x16_t:void, *mut u16:uint16x8_t:void, *mut u32:uint32x4_t:void, *mut u64:uint64x2_t:void +generate *mut p8:poly8x8_t:void, *mut p16:poly16x4_t:void, *mut p8:poly8x16_t:void, *mut p16:poly16x8_t:void target = aes -//generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void +generate *mut p64:poly64x1_t:void, *mut p64:poly64x2_t:void /// Store multiple single-element structures from one, two, three, or four registers name = vst1 @@ -2725,10 +2725,10 @@ validate 1., 0., 0., 0., 0., 0., 0., 0. 
store_fn aarch64 = nop -//generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void +generate *mut f64:float64x1_t:void, *mut f64:float64x2_t:void arm = nop -//generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void +generate *mut f32:float32x2_t:void, *mut f32:float32x4_t:void /// Store multiple single-element structures from one, two, three, or four registers name = vst1 diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index c66a7c93d2..cc7607bac6 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -130,10 +130,10 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { "usad8" | "vfma" | "vfms" => 27, "qadd8" | "qsub8" | "sadd8" | "sel" | "shadd8" | "shsub8" | "usub8" | "ssub8" => 29, // core_arch/src/arm_shared/simd32 - // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) + // vst1q_s64_x4_vst1 : #instructions = 22 >= 22 (limit) "vld3" => 23, // core_arch/src/arm_shared/simd32 - // vld4q_lane_u32_vld4 : #instructions = 32 >= 22 (limit) + // vld4q_lane_u32_vld4 : #instructions = 31 >= 22 (limit) "vld4" => 32, // core_arch/src/arm_shared/simd32 // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) From 2cb18a4efe4a796a5ed76c84515cd09f743260d3 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 17:07:47 +0800 Subject: [PATCH 21/28] add vst2 neon instr --- .../core_arch/src/aarch64/neon/generated.rs | 100 ++++ .../src/arm_shared/neon/generated.rs | 561 ++++++++++++++++++ crates/stdarch-gen/neon.spec | 22 +- 3 files changed, 672 insertions(+), 11 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index cefde1bf26..e51e34ff2d 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -5425,6 +5425,61 @@ pub unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t) { vst1q_f64_x4_(b.0, b.1, b.2, b.3, a) } +/// Store multiple 2-element structures from two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2i64.p0i8")] + fn vst2q_s64_(a: int64x2_t, b: int64x2_t, ptr: *mut i8); + } + vst2q_s64_(b.0, b.1, a.cast()) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_u64(a: *mut u64, b: uint64x2x2_t) { + transmute(vst2q_s64(transmute(a), transmute(b))) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_p64(a: *mut p64, b: poly64x2x2_t) { + transmute(vst2q_s64(transmute(a), transmute(b))) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1f64.p0i8")] + fn vst2_f64_(a: float64x1_t, b: float64x1_t, ptr: *mut i8); + } + vst2_f64_(b.0, b.1, a.cast()) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st2))] +pub unsafe fn vst2q_f64(a: *mut f64, b: 
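A quick usage sketch of the vst1 lane stores added above — illustrative only, not part of the patch series. It assumes an AArch64 target with NEON enabled; the lane index is a const generic that the static_assert_imm* macros bound-check at compile time:

    use core::arch::aarch64::{vld1q_f32, vst1q_lane_f32};

    unsafe fn store_lane_three() {
        let src = [1.0f32, 2.0, 3.0, 4.0];
        let v = vld1q_f32(src.as_ptr());
        let mut out = 0.0f32;
        // A single-lane store lowers to a plain scalar store, which is
        // why the tests above assert `nop` rather than a vst1/st1.
        vst1q_lane_f32::<3>(&mut out, v);
        assert_eq!(out, 4.0);
    }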
From 2cb18a4efe4a796a5ed76c84515cd09f743260d3 Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 17:07:47 +0800
Subject: [PATCH 21/28] add vst2 neon instr

---
 .../core_arch/src/aarch64/neon/generated.rs   | 100 ++++
 .../src/arm_shared/neon/generated.rs          | 561 ++++++++++++++++++
 crates/stdarch-gen/neon.spec                  |  22 +-
 3 files changed, 672 insertions(+), 11 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index cefde1bf26..e51e34ff2d 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -5425,6 +5425,61 @@ pub unsafe fn vst1q_f64_x4(a: *mut f64, b: float64x2x4_t) {
     vst1q_f64_x4_(b.0, b.1, b.2, b.3, a)
 }
 
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+pub unsafe fn vst2q_s64(a: *mut i64, b: int64x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2i64.p0i8")]
+        fn vst2q_s64_(a: int64x2_t, b: int64x2_t, ptr: *mut i8);
+    }
+    vst2q_s64_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+pub unsafe fn vst2q_u64(a: *mut u64, b: uint64x2x2_t) {
+    transmute(vst2q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st2))]
+pub unsafe fn vst2q_p64(a: *mut p64, b: poly64x2x2_t) {
+    transmute(vst2q_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1f64.p0i8")]
+        fn vst2_f64_(a: float64x1_t, b: float64x1_t, ptr: *mut i8);
+    }
+    vst2_f64_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2))]
+pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2f64.p0i8")]
+        fn vst2q_f64_(a: float64x2_t, b: float64x2_t, ptr: *mut i8);
+    }
+    vst2q_f64_(b.0, b.1, a.cast())
+}
+
 /// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
@@ -14425,6 +14480,51 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_s64() {
+        let a: [i64; 5] = [0, 1, 2, 2, 3];
+        let e: [i64; 4] = [1, 2, 2, 3];
+        let mut r: [i64; 4] = [0i64; 4];
+        vst2q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_u64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 3];
+        let e: [u64; 4] = [1, 2, 2, 3];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst2q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_p64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 3];
+        let e: [u64; 4] = [1, 2, 2, 3];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst2q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_f64() {
+        let a: [f64; 3] = [0., 1., 2.];
+        let e: [f64; 2] = [1., 2.];
+        let mut r: [f64; 2] = [0f64; 2];
+        vst2_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_f64() {
+        let a: [f64; 5] = [0., 1., 2., 2., 3.];
+        let e: [f64; 4] = [1., 2., 2., 3.];
+        let mut r: [f64; 4] = [0f64; 4];
+        vst2q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_f64() {
         let a: f64 = 1.0;
diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index a8e00138c3..8c09edb49c 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -11498,6 +11498,378 @@ pub unsafe fn vst1q_f32_x4(a: *mut f32, b: float32x4x4_t) {
     vst1q_f32_x4_(b.0, b.1, b.2, b.3, a)
 }
 
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v8i8")]
+        fn vst2_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, size: i32);
+    }
+vst2_s8_(a.cast(), b.0, b.1, 1)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_s8(a: *mut i8, b: int8x8x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v8i8.p0i8")]
+        fn vst2_s8_(a: int8x8_t, b: int8x8_t, ptr: *mut i8);
+    }
+vst2_s8_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v4i16")]
+        fn vst2_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, size: i32);
+    }
+vst2_s16_(a.cast(), b.0, b.1, 2)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_s16(a: *mut i16, b: int16x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v4i16.p0i8")]
+        fn vst2_s16_(a: int16x4_t, b: int16x4_t, ptr: *mut i8);
+    }
+vst2_s16_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v2i32")]
+        fn vst2_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, size: i32);
+    }
+vst2_s32_(a.cast(), b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_s32(a: *mut i32, b: int32x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2i32.p0i8")]
+        fn vst2_s32_(a: int32x2_t, b: int32x2_t, ptr: *mut i8);
+    }
+vst2_s32_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v1i64")]
+        fn vst2_s64_(ptr: *mut i8, a: int64x1_t, b: int64x1_t, size: i32);
+    }
+vst2_s64_(a.cast(), b.0, b.1, 8)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1i64.p0i8")]
+        fn vst2_s64_(a: int64x1_t, b: int64x1_t, ptr: *mut i8);
+    }
+vst2_s64_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v16i8")]
+        fn vst2q_s8_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, size: i32);
+    }
+vst2q_s8_(a.cast(), b.0, b.1, 1)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_s8(a: *mut i8, b: int8x16x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v16i8.p0i8")]
+        fn vst2q_s8_(a: int8x16_t, b: int8x16_t, ptr: *mut i8);
+    }
+vst2q_s8_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v8i16")]
+        fn vst2q_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, size: i32);
+    }
+vst2q_s16_(a.cast(), b.0, b.1, 2)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_s16(a: *mut i16, b: int16x8x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v8i16.p0i8")]
+        fn vst2q_s16_(a: int16x8_t, b: int16x8_t, ptr: *mut i8);
+    }
+vst2q_s16_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v4i32")]
+        fn vst2q_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, size: i32);
+    }
+vst2q_s32_(a.cast(), b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_s32(a: *mut i32, b: int32x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v4i32.p0i8")]
+        fn vst2q_s32_(a: int32x4_t, b: int32x4_t, ptr: *mut i8);
+    }
+vst2q_s32_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) {
+    transmute(vst2_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_u16(a: *mut u16, b: uint16x4x2_t) {
+    transmute(vst2_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t) {
+    transmute(vst2_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) {
+    transmute(vst2_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_u8(a: *mut u8, b: uint8x16x2_t) {
+    transmute(vst2q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_u16(a: *mut u16, b: uint16x8x2_t) {
+    transmute(vst2q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_u32(a: *mut u32, b: uint32x4x2_t) {
+    transmute(vst2q_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_p8(a: *mut p8, b: poly8x8x2_t) {
+    transmute(vst2_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_p16(a: *mut p16, b: poly16x4x2_t) {
+    transmute(vst2_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_p8(a: *mut p8, b: poly8x16x2_t) {
+    transmute(vst2q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t) {
+    transmute(vst2q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_p64(a: *mut p64, b: poly64x1x2_t) {
+    transmute(vst2_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v2f32")]
+        fn vst2_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, size: i32);
+    }
+vst2_f32_(a.cast(), b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2_f32(a: *mut f32, b: float32x2x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v2f32.p0i8")]
+        fn vst2_f32_(a: float32x2_t, b: float32x2_t, ptr: *mut i8);
+    }
+vst2_f32_(b.0, b.1, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))]
+pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2.p0i8.v4f32")]
+        fn vst2q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, size: i32);
+    }
+vst2q_f32_(a.cast(), b.0, b.1, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))]
+pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v4f32.p0i8")]
+        fn vst2q_f32_(a: float32x4_t, b: float32x4_t, ptr: *mut i8);
+    }
+vst2q_f32_(b.0, b.1, a.cast())
+}
+
 /// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
@@ -27454,6 +27826,195 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_s8() {
+        let a: [i8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [i8; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+        let mut r: [i8; 16] = [0i8; 16];
+        vst2_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_s16() {
+        let a: [i16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [i16; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+        let mut r: [i16; 8] = [0i16; 8];
+        vst2_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_s32() {
+        let a: [i32; 5] = [0, 1, 2, 2, 3];
+        let e: [i32; 4] = [1, 2, 2, 3];
+        let mut r: [i32; 4] = [0i32; 4];
+        vst2_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_s64() {
+        let a: [i64; 3] = [0, 1, 2];
+        let e: [i64; 2] = [1, 2];
+        let mut r: [i64; 2] = [0i64; 2];
+        vst2_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_s8() {
+        let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+        let e: [i8; 32] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+        let mut r: [i8; 32] = [0i8; 32];
+        vst2q_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_s16() {
+        let a: [i16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [i16; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+        let mut r: [i16; 16] = [0i16; 16];
+        vst2q_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_s32() {
+        let a: [i32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [i32; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+        let mut r: [i32; 8] = [0i32; 8];
+        vst2q_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_u8() {
+        let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u8; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+        let mut r: [u8; 16] = [0u8; 16];
+        vst2_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_u16() {
+        let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [u16; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+        let mut r: [u16; 8] = [0u16; 8];
+        vst2_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_u32() {
+        let a: [u32; 5] = [0, 1, 2, 2, 3];
+        let e: [u32; 4] = [1, 2, 2, 3];
+        let mut r: [u32; 4] = [0u32; 4];
+        vst2_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_u64() {
+        let a: [u64; 3] = [0, 1, 2];
+        let e: [u64; 2] = [1, 2];
+        let mut r: [u64; 2] = [0u64; 2];
+        vst2_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_u8() {
+        let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+        let e: [u8; 32] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst2q_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_u16() {
+        let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u16; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst2q_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_u32() {
+        let a: [u32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [u32; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+        let mut r: [u32; 8] = [0u32; 8];
+        vst2q_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_p8() {
+        let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u8; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+        let mut r: [u8; 16] = [0u8; 16];
+        vst2_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_p16() {
+        let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [u16; 8] = [1, 2, 2, 3, 2, 4, 3, 5];
+        let mut r: [u16; 8] = [0u16; 8];
+        vst2_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_p8() {
+        let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+        let e: [u8; 32] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15, 8, 16, 9, 17];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst2q_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_p16() {
+        let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u16; 16] = [1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst2q_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_p64() {
+        let a: [u64; 3] = [0, 1, 2];
+        let e: [u64; 2] = [1, 2];
+        let mut r: [u64; 2] = [0u64; 2];
+        vst2_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_f32() {
+        let a: [f32; 5] = [0., 1., 2., 2., 3.];
+        let e: [f32; 4] = [1., 2., 2., 3.];
+        let mut r: [f32; 4] = [0f32; 4];
+        vst2_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_f32() {
+        let a: [f32; 9] = [0., 1., 2., 2., 3., 2., 3., 4., 5.];
+        let e: [f32; 8] = [1., 2., 2., 3., 2., 4., 3., 5.];
+        let mut r: [f32; 8] = [0f32; 8];
+        vst2q_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_s8() {
         let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index 6a5bc7a08a..97f83a9d38 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -2819,12 +2819,12 @@ arm-aarch64-separate
 
 aarch64 = st2
 link-aarch64 = st2._EXTpi8_
-//generate *mut i64:int64x2x2_t:void
+generate *mut i64:int64x2x2_t:void
 
 arm = vst2
 link-arm = vst2._EXTpi8r_
-//generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
-//generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
+generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void
+generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
 
 /// Store multiple 2-element structures from two registers
 name = vst2
@@ -2835,17 +2835,17 @@ validate 1, 2, 2, 3, 2, 4, 3, 5, 2, 6, 3, 7, 4, 8, 5, 9, 2, 10, 3, 11, 4, 12, 5,
 store_fn
 
 aarch64 = st2
-//generate *mut u64:uint64x2x2_t:void
+generate *mut u64:uint64x2x2_t:void
 target = aes
-//generate *mut p64:poly64x2x2_t:void
+generate *mut p64:poly64x2x2_t:void
 target = default
 
 arm = vst2
-//generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
-//generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
-//generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void
+generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void
+generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
+generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void
 target = aes
-//generate *mut p64:poly64x1x2_t:void
+generate *mut p64:poly64x1x2_t:void
 
 /// Store multiple 2-element structures from two registers
 name = vst2
@@ -2857,11 +2857,11 @@ arm-aarch64-separate
 
 aarch64 = st2
 link-aarch64 = st2._EXTpi8_
-//generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
 
 arm = vst2
 link-arm = vst2._EXTpi8r_
-//generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
 
 /// Store multiple 2-element structures from two registers
 name = vst2
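For context on the vst2 family added above: st2/vst2 store two registers with their elements interleaved. A minimal sketch, assuming an AArch64 target with NEON (the same shape should work on ARM via core::arch::arm):

    use core::arch::aarch64::{int16x4x2_t, vld1_s16, vst2_s16};

    unsafe fn interleaved_store() {
        let a = vld1_s16([0i16, 1, 2, 3].as_ptr());
        let b = vld1_s16([10i16, 11, 12, 13].as_ptr());
        let mut out = [0i16; 8];
        // Writes a[0], b[0], a[1], b[1], ... — the same interleaving the
        // expected values in the tests above encode.
        vst2_s16(out.as_mut_ptr(), int16x4x2_t(a, b));
        assert_eq!(out, [0, 10, 1, 11, 2, 12, 3, 13]);
    }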
"v7"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] -pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) { - transmute(vst2_s8(transmute(a), transmute(b))) +pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2.v1i64.p0i8")] + fn vst2_s64_(a: int64x1_t, b: int64x1_t, ptr: *mut i8); + } +vst2_s64_(b.0, b.1, a.cast()) } /// Store multiple 2-element structures from two registers @@ -11710,8 +11700,8 @@ pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] -pub unsafe fn vst2_u16(a: *mut u16, b: uint16x4x2_t) { - transmute(vst2_s16(transmute(a), transmute(b))) +pub unsafe fn vst2_u8(a: *mut u8, b: uint8x8x2_t) { + transmute(vst2_s8(transmute(a), transmute(b))) } /// Store multiple 2-element structures from two registers @@ -11720,8 +11710,8 @@ pub unsafe fn vst2_u16(a: *mut u16, b: uint16x4x2_t) { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] -pub unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t) { - transmute(vst2_s32(transmute(a), transmute(b))) +pub unsafe fn vst2_u16(a: *mut u16, b: uint16x4x2_t) { + transmute(vst2_s16(transmute(a), transmute(b))) } /// Store multiple 2-element structures from two registers @@ -11730,8 +11720,8 @@ pub unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t) { #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] -pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) { - transmute(vst2_s64(transmute(a), transmute(b))) +pub unsafe fn vst2_u32(a: *mut u32, b: uint32x2x2_t) { + transmute(vst2_s32(transmute(a), transmute(b))) } /// Store multiple 2-element structures from two registers @@ -11804,11 +11794,21 @@ pub unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t) { transmute(vst2q_s16(transmute(a), transmute(b))) } +/// Store multiple 2-element structures from two registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] +pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) { + transmute(vst2_s64(transmute(a), transmute(b))) +} + /// Store multiple 2-element structures from two registers #[inline] #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] -#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] #[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] pub unsafe fn vst2_p64(a: *mut p64, b: poly64x1x2_t) { transmute(vst2_s64(transmute(a), transmute(b))) @@ -27853,15 +27853,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vst2_s64() { - let a: [i64; 3] = [0, 1, 2]; - let e: [i64; 2] = [1, 2]; - let mut r: [i64; 2] = [0i64; 2]; - vst2_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); - assert_eq!(r, 
e); - } - #[simd_test(enable = "neon")] unsafe fn test_vst2q_s8() { let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]; @@ -27889,6 +27880,15 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vst2_s64() { + let a: [i64; 3] = [0, 1, 2]; + let e: [i64; 2] = [1, 2]; + let mut r: [i64; 2] = [0i64; 2]; + vst2_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst2_u8() { let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9]; @@ -27916,15 +27916,6 @@ mod test { assert_eq!(r, e); } - #[simd_test(enable = "neon")] - unsafe fn test_vst2_u64() { - let a: [u64; 3] = [0, 1, 2]; - let e: [u64; 2] = [1, 2]; - let mut r: [u64; 2] = [0u64; 2]; - vst2_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); - assert_eq!(r, e); - } - #[simd_test(enable = "neon")] unsafe fn test_vst2q_u8() { let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]; @@ -27988,6 +27979,15 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vst2_u64() { + let a: [u64; 3] = [0, 1, 2]; + let e: [u64; 2] = [1, 2]; + let mut r: [u64; 2] = [0u64; 2]; + vst2_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vst2_p64() { let a: [u64; 3] = [0, 1, 2]; diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 97f83a9d38..8c1810ef3d 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2823,8 +2823,10 @@ generate *mut i64:int64x2x2_t:void arm = vst2 link-arm = vst2._EXTpi8r_ -generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void, *mut i64:int64x1x2_t:void +generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void +arm = nop +generate *mut i64:int64x1x2_t:void /// Store multiple 2-element structures from two registers name = vst2 @@ -2841,9 +2843,11 @@ generate *mut p64:poly64x2x2_t:void target = default arm = vst2 -generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void, *mut u64:uint64x1x2_t:void +generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void +arm = nop +generate *mut u64:uint64x1x2_t:void target = aes generate *mut p64:poly64x1x2_t:void From 9f124b296042cd474be68502532f548f80bc4a50 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 17:18:26 +0800 Subject: [PATCH 23/28] correct assert_instr --- crates/core_arch/src/aarch64/neon/generated.rs | 2 +- crates/core_arch/src/arm_shared/neon/generated.rs | 6 +++--- crates/stdarch-gen/neon.spec | 8 ++++++-- 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index e51e34ff2d..75bb668f7f 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -5457,7 +5457,7 @@ pub unsafe fn vst2q_p64(a: *mut p64, b: poly64x2x2_t) { /// Store 
multiple 2-element structures from two registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(st2))] +#[cfg_attr(test, assert_instr(st1))] pub unsafe fn vst2_f64(a: *mut f64, b: float64x1x2_t) { #[allow(improper_ctypes)] extern "unadjusted" { diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 8ab14c199e..4265a5138d 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -11684,7 +11684,7 @@ vst2_s64_(a.cast(), b.0, b.1, 8) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vst2_s64(a: *mut i64, b: int64x1x2_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -11799,7 +11799,7 @@ pub unsafe fn vst2q_p16(a: *mut p16, b: poly16x8x2_t) { #[target_feature(enable = "neon")] #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) { transmute(vst2_s64(transmute(a), transmute(b))) } @@ -11809,7 +11809,7 @@ pub unsafe fn vst2_u64(a: *mut u64, b: uint64x1x2_t) { #[target_feature(enable = "neon,aes")] #[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] #[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] pub unsafe fn vst2_p64(a: *mut p64, b: poly64x1x2_t) { transmute(vst2_s64(transmute(a), transmute(b))) } diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 8c1810ef3d..1393c68184 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2826,6 +2826,7 @@ link-arm = vst2._EXTpi8r_ generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void generate *mut i8:int8x16x2_t:void, *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void arm = nop +aarch64 = nop generate *mut i64:int64x1x2_t:void /// Store multiple 2-element structures from two registers @@ -2847,6 +2848,7 @@ generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2 generate *mut u8:uint8x16x2_t:void, *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p8:poly8x16x2_t:void, *mut p16:poly16x8x2_t:void arm = nop +aarch64 = nop generate *mut u64:uint64x1x2_t:void target = aes generate *mut p64:poly64x1x2_t:void @@ -2859,9 +2861,11 @@ validate 1., 2., 2., 3., 2., 4., 3., 5., 2., 6., 3., 7., 4., 8., 5., 9. 
store_fn arm-aarch64-separate -aarch64 = st2 +aarch64 = st1 link-aarch64 = st2._EXTpi8_ -generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void +generate *mut f64:float64x1x2_t:void +aarch64 = st2 +generate *mut f64:float64x2x2_t:void arm = vst2 link-arm = vst2._EXTpi8r_ From d3d736560596b99510866004bd14c418d169ebfe Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 17:36:39 +0800 Subject: [PATCH 24/28] add vst3 and vst4 neon instrs --- .../core_arch/src/aarch64/neon/generated.rs | 200 +++ .../src/arm_shared/neon/generated.rs | 1122 +++++++++++++++++ crates/stdarch-gen/neon.spec | 60 +- 3 files changed, 1360 insertions(+), 22 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 75bb668f7f..53fa330947 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -5480,6 +5480,116 @@ pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) { vst2q_f64_(b.0, b.1, a.cast()) } +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_s64(a: *mut i64, b: int64x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2i64.p0i8")] + fn vst3q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, ptr: *mut i8); + } + vst3q_s64_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_u64(a: *mut u64, b: uint64x2x3_t) { + transmute(vst3q_s64(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3q_p64(a: *mut p64, b: poly64x2x3_t) { + transmute(vst3q_s64(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st3))] +pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v1f64.p0i8")] + fn vst3_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, ptr: *mut i8); + } + vst3_f64_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2f64.p0i8")] + fn vst3q_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, ptr: *mut i8); + } + vst3q_f64_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_s64(a: *mut i64, b: int64x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2i64.p0i8")] + fn vst4q_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, ptr: *mut i8); + } + vst4q_s64_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] 
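The corrections above appear to track what the compiler actually emits: with a single element per register there is nothing to interleave, so the 64-bit vst2 variants lower to plain stores (a single st1 on AArch64 for vst2_f64) rather than an st2. A rough plain-Rust equivalent of vst2_s64, for illustration only (the helper name is hypothetical):

    // Two one-element "structures" are just two consecutive i64 values,
    // so no interleaving pattern emerges and no st2/vst2 is required.
    unsafe fn vst2_s64_equivalent(ptr: *mut i64, b0: i64, b1: i64) {
        ptr.write_unaligned(b0);
        ptr.add(1).write_unaligned(b1);
    }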
+#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_u64(a: *mut u64, b: uint64x2x4_t) { + transmute(vst4q_s64(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4q_p64(a: *mut p64, b: poly64x2x4_t) { + transmute(vst4q_s64(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(st4))] +pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v1f64.p0i8")] + fn vst4_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, ptr: *mut i8); + } + vst4_f64_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(test, assert_instr(nop))] +pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f64.p0i8")] + fn vst4q_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, ptr: *mut i8); + } + vst4q_f64_(b.0, b.1, b.2, b.3, a.cast()) +} + /// Multiply #[inline] #[target_feature(enable = "neon")] @@ -14525,6 +14635,96 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_s64() { + let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4]; + let e: [i64; 6] = [1, 2, 2, 2, 4, 4]; + let mut r: [i64; 6] = [0i64; 6]; + vst3q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_u64() { + let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4]; + let e: [u64; 6] = [1, 2, 2, 2, 4, 4]; + let mut r: [u64; 6] = [0u64; 6]; + vst3q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_p64() { + let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4]; + let e: [u64; 6] = [1, 2, 2, 2, 4, 4]; + let mut r: [u64; 6] = [0u64; 6]; + vst3q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_f64() { + let a: [f64; 4] = [0., 1., 2., 2.]; + let e: [f64; 3] = [1., 2., 2.]; + let mut r: [f64; 3] = [0f64; 3]; + vst3_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_f64() { + let a: [f64; 7] = [0., 1., 2., 2., 4., 2., 4.]; + let e: [f64; 6] = [1., 2., 2., 2., 4., 4.]; + let mut r: [f64; 6] = [0f64; 6]; + vst3q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_s64() { + let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [i64; 8] = [1, 2, 2, 6, 2, 6, 6, 8]; + let mut r: [i64; 8] = [0i64; 8]; + vst4q_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_u64() { + let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [u64; 8] = [1, 2, 2, 6, 2, 6, 6, 8]; + let mut r: [u64; 8] = [0u64; 8]; + vst4q_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + 
#[simd_test(enable = "neon")] + unsafe fn test_vst4q_p64() { + let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [u64; 8] = [1, 2, 2, 6, 2, 6, 6, 8]; + let mut r: [u64; 8] = [0u64; 8]; + vst4q_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_f64() { + let a: [f64; 5] = [0., 1., 2., 2., 6.]; + let e: [f64; 4] = [1., 2., 2., 6.]; + let mut r: [f64; 4] = [0f64; 4]; + vst4_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_f64() { + let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.]; + let e: [f64; 8] = [1., 2., 2., 6., 2., 6., 6., 8.]; + let mut r: [f64; 8] = [0f64; 8]; + vst4q_f64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_f64() { let a: f64 = 1.0; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 4265a5138d..2930f27cc5 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -11870,6 +11870,750 @@ pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) { vst2q_f32_(b.0, b.1, a.cast()) } +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v8i8")] + fn vst3_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, size: i32); + } +vst3_s8_(a.cast(), b.0, b.1, b.2, 1) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_s8(a: *mut i8, b: int8x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v8i8.p0i8")] + fn vst3_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, ptr: *mut i8); + } +vst3_s8_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v4i16")] + fn vst3_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, size: i32); + } +vst3_s16_(a.cast(), b.0, b.1, b.2, 2) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_s16(a: *mut i16, b: int16x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v4i16.p0i8")] + fn vst3_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, ptr: *mut i8); + } +vst3_s16_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = 
"neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v2i32")] + fn vst3_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, size: i32); + } +vst3_s32_(a.cast(), b.0, b.1, b.2, 4) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_s32(a: *mut i32, b: int32x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2i32.p0i8")] + fn vst3_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, ptr: *mut i8); + } +vst3_s32_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v16i8")] + fn vst3q_s8_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, size: i32); + } +vst3q_s8_(a.cast(), b.0, b.1, b.2, 1) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_s8(a: *mut i8, b: int8x16x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v16i8.p0i8")] + fn vst3q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, ptr: *mut i8); + } +vst3q_s8_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v8i16")] + fn vst3q_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, size: i32); + } +vst3q_s16_(a.cast(), b.0, b.1, b.2, 2) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_s16(a: *mut i16, b: int16x8x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v8i16.p0i8")] + fn vst3q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, ptr: *mut i8); + } +vst3q_s16_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v4i32")] + fn vst3q_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, size: i32); + } +vst3q_s32_(a.cast(), b.0, b.1, b.2, 4) +} + +/// Store 
multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_s32(a: *mut i32, b: int32x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v4i32.p0i8")] + fn vst3q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, ptr: *mut i8); + } +vst3q_s32_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v1i64")] + fn vst3_s64_(ptr: *mut i8, a: int64x1_t, b: int64x1_t, c: int64x1_t, size: i32); + } +vst3_s64_(a.cast(), b.0, b.1, b.2, 8) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst3_s64(a: *mut i64, b: int64x1x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v1i64.p0i8")] + fn vst3_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, ptr: *mut i8); + } +vst3_s64_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_u8(a: *mut u8, b: uint8x8x3_t) { + transmute(vst3_s8(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_u16(a: *mut u16, b: uint16x4x3_t) { + transmute(vst3_s16(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_u32(a: *mut u32, b: uint32x2x3_t) { + transmute(vst3_s32(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_u8(a: *mut u8, b: uint8x16x3_t) { + transmute(vst3q_s8(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn 
vst3q_u16(a: *mut u16, b: uint16x8x3_t) { + transmute(vst3q_s16(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_u32(a: *mut u32, b: uint32x4x3_t) { + transmute(vst3q_s32(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_p8(a: *mut p8, b: poly8x8x3_t) { + transmute(vst3_s8(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3_p16(a: *mut p16, b: poly16x4x3_t) { + transmute(vst3_s16(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_p8(a: *mut p8, b: poly8x16x3_t) { + transmute(vst3q_s8(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] +pub unsafe fn vst3q_p16(a: *mut p16, b: poly16x8x3_t) { + transmute(vst3q_s16(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst3_u64(a: *mut u64, b: uint64x1x3_t) { + transmute(vst3_s64(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst3_p64(a: *mut p64, b: poly64x1x3_t) { + transmute(vst3_s64(transmute(a), transmute(b))) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v2f32")] + fn vst3_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, size: i32); + } +vst3_f32_(a.cast(), b.0, b.1, b.2, 
4) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v2f32.p0i8")] + fn vst3_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, ptr: *mut i8); + } +vst3_f32_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3))] +pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3.p0i8.v4f32")] + fn vst3q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, size: i32); + } +vst3q_f32_(a.cast(), b.0, b.1, b.2, 4) +} + +/// Store multiple 3-element structures from three registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3.v4f32.p0i8")] + fn vst3q_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, ptr: *mut i8); + } +vst3q_f32_(b.0, b.1, b.2, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v8i8")] + fn vst4_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, size: i32); + } +vst4_s8_(a.cast(), b.0, b.1, b.2, b.3, 1) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_s8(a: *mut i8, b: int8x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v8i8.p0i8")] + fn vst4_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, ptr: *mut i8); + } +vst4_s8_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4i16")] + fn vst4_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, size: i32); + } +vst4_s16_(a.cast(), b.0, b.1, b.2, b.3, 2) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_s16(a: *mut i16, b: int16x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = 
"aarch64", link_name = "llvm.aarch64.neon.st4.v4i16.p0i8")] + fn vst4_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, ptr: *mut i8); + } +vst4_s16_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v2i32")] + fn vst4_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, size: i32); + } +vst4_s32_(a.cast(), b.0, b.1, b.2, b.3, 4) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_s32(a: *mut i32, b: int32x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2i32.p0i8")] + fn vst4_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, ptr: *mut i8); + } +vst4_s32_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v16i8")] + fn vst4q_s8_(ptr: *mut i8, a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, size: i32); + } +vst4q_s8_(a.cast(), b.0, b.1, b.2, b.3, 1) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_s8(a: *mut i8, b: int8x16x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v16i8.p0i8")] + fn vst4q_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, ptr: *mut i8); + } +vst4q_s8_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v8i16")] + fn vst4q_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, size: i32); + } +vst4q_s16_(a.cast(), b.0, b.1, b.2, b.3, 2) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_s16(a: *mut i16, b: int16x8x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v8i16.p0i8")] + fn vst4q_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, ptr: *mut i8); + } +vst4q_s16_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] 
+#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4i32")] + fn vst4q_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, size: i32); + } +vst4q_s32_(a.cast(), b.0, b.1, b.2, b.3, 4) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_s32(a: *mut i32, b: int32x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4i32.p0i8")] + fn vst4q_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, ptr: *mut i8); + } +vst4q_s32_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v1i64")] + fn vst4_s64_(ptr: *mut i8, a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, size: i32); + } +vst4_s64_(a.cast(), b.0, b.1, b.2, b.3, 8) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst4_s64(a: *mut i64, b: int64x1x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v1i64.p0i8")] + fn vst4_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, ptr: *mut i8); + } +vst4_s64_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_u8(a: *mut u8, b: uint8x8x4_t) { + transmute(vst4_s8(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_u16(a: *mut u16, b: uint16x4x4_t) { + transmute(vst4_s16(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_u32(a: *mut u32, b: uint32x2x4_t) { + transmute(vst4_s32(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] 
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_u8(a: *mut u8, b: uint8x16x4_t) { + transmute(vst4q_s8(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_u16(a: *mut u16, b: uint16x8x4_t) { + transmute(vst4q_s16(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_u32(a: *mut u32, b: uint32x4x4_t) { + transmute(vst4q_s32(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_p8(a: *mut p8, b: poly8x8x4_t) { + transmute(vst4_s8(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4_p16(a: *mut p16, b: poly16x4x4_t) { + transmute(vst4_s16(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_p8(a: *mut p8, b: poly8x16x4_t) { + transmute(vst4q_s8(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_p16(a: *mut p16, b: poly16x8x4_t) { + transmute(vst4q_s16(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst4_u64(a: *mut u64, b: uint64x1x4_t) { + transmute(vst4_s64(transmute(a), transmute(b))) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[target_feature(enable = "neon,aes")] +#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst4_p64(a: *mut p64, b: poly64x1x4_t) { + transmute(vst4_s64(transmute(a), transmute(b))) +} + +/// Store multiple 4-element 
structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v2f32")] + fn vst4_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, size: i32); + } +vst4_f32_(a.cast(), b.0, b.1, b.2, b.3, 4) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f32.p0i8")] + fn vst4_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut i8); + } +vst4_f32_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))] +pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4f32")] + fn vst4q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, size: i32); + } +vst4q_f32_(a.cast(), b.0, b.1, b.2, b.3, 4) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4f32.p0i8")] + fn vst4q_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut i8); + } +vst4q_f32_(b.0, b.1, b.2, b.3, a.cast()) +} + /// Multiply #[inline] #[target_feature(enable = "neon")] @@ -28015,6 +28759,384 @@ mod test { assert_eq!(r, e); } + #[simd_test(enable = "neon")] + unsafe fn test_vst3_s8() { + let a: [i8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16]; + let e: [i8; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let mut r: [i8; 24] = [0i8; 24]; + vst3_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_s16() { + let a: [i16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8]; + let e: [i16; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let mut r: [i16; 12] = [0i16; 12]; + vst3_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_s32() { + let a: [i32; 7] = [0, 1, 2, 2, 4, 2, 4]; + let e: [i32; 6] = [1, 2, 2, 2, 4, 4]; + let mut r: [i32; 6] = [0i32; 6]; + vst3_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_s8() { + let a: [i8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 
45, 46, 47, 48]; + let e: [i8; 48] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; + let mut r: [i8; 48] = [0i8; 48]; + vst3q_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_s16() { + let a: [i16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16]; + let e: [i16; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let mut r: [i16; 24] = [0i16; 24]; + vst3q_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_s32() { + let a: [i32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8]; + let e: [i32; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let mut r: [i32; 12] = [0i32; 12]; + vst3q_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_s64() { + let a: [i64; 4] = [0, 1, 2, 2]; + let e: [i64; 3] = [1, 2, 2]; + let mut r: [i64; 3] = [0i64; 3]; + vst3_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_u8() { + let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16]; + let e: [u8; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let mut r: [u8; 24] = [0u8; 24]; + vst3_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_u16() { + let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8]; + let e: [u16; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let mut r: [u16; 12] = [0u16; 12]; + vst3_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_u32() { + let a: [u32; 7] = [0, 1, 2, 2, 4, 2, 4]; + let e: [u32; 6] = [1, 2, 2, 2, 4, 4]; + let mut r: [u32; 6] = [0u32; 6]; + vst3_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_u8() { + let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48]; + let e: [u8; 48] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; + let mut r: [u8; 48] = [0u8; 48]; + vst3q_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_u16() { + let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16]; + let e: [u16; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let mut r: [u16; 24] = [0u16; 24]; + vst3q_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_u32() { + let a: [u32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8]; + let e: [u32; 12] = [1, 2, 2, 2, 4, 4, 2, 
7, 7, 4, 8, 8]; + let mut r: [u32; 12] = [0u32; 12]; + vst3q_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_p8() { + let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16]; + let e: [u8; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let mut r: [u8; 24] = [0u8; 24]; + vst3_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_p16() { + let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8]; + let e: [u16; 12] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8]; + let mut r: [u16; 12] = [0u16; 12]; + vst3_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_p8() { + let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48]; + let e: [u8; 48] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16, 2, 25, 41, 4, 26, 42, 7, 27, 43, 8, 28, 44, 13, 29, 45, 14, 30, 46, 15, 31, 47, 16, 32, 48]; + let mut r: [u8; 48] = [0u8; 48]; + vst3q_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_p16() { + let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16]; + let e: [u16; 24] = [1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, 16, 16]; + let mut r: [u16; 24] = [0u16; 24]; + vst3q_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_u64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let e: [u64; 3] = [1, 2, 2]; + let mut r: [u64; 3] = [0u64; 3]; + vst3_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_p64() { + let a: [u64; 4] = [0, 1, 2, 2]; + let e: [u64; 3] = [1, 2, 2]; + let mut r: [u64; 3] = [0u64; 3]; + vst3_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3_f32() { + let a: [f32; 7] = [0., 1., 2., 2., 4., 2., 4.]; + let e: [f32; 6] = [1., 2., 2., 2., 4., 4.]; + let mut r: [f32; 6] = [0f32; 6]; + vst3_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst3q_f32() { + let a: [f32; 13] = [0., 1., 2., 2., 4., 2., 4., 7., 8., 2., 4., 7., 8.]; + let e: [f32; 12] = [1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8.]; + let mut r: [f32; 12] = [0f32; 12]; + vst3q_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_s8() { + let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [i8; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let mut r: [i8; 32] = [0i8; 32]; + vst4_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable 
= "neon")] + unsafe fn test_vst4_s16() { + let a: [i16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [i16; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let mut r: [i16; 16] = [0i16; 16]; + vst4_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_s32() { + let a: [i32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [i32; 8] = [1, 2, 2, 6, 2, 6, 6, 8]; + let mut r: [i32; 8] = [0i32; 8]; + vst4_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_s8() { + let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let e: [i8; 64] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let mut r: [i8; 64] = [0i8; 64]; + vst4q_s8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_s16() { + let a: [i16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [i16; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let mut r: [i16; 32] = [0i16; 32]; + vst4q_s16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_s32() { + let a: [i32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [i32; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let mut r: [i32; 16] = [0i32; 16]; + vst4q_s32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_s64() { + let a: [i64; 5] = [0, 1, 2, 2, 6]; + let e: [i64; 4] = [1, 2, 2, 6]; + let mut r: [i64; 4] = [0i64; 4]; + vst4_s64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_u8() { + let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [u8; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let mut r: [u8; 32] = [0u8; 32]; + vst4_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_u16() { + let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [u16; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let mut r: [u16; 16] = [0u16; 16]; + vst4_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_u32() { + let a: [u32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8]; + let e: [u32; 8] = [1, 2, 2, 6, 2, 6, 6, 8]; + let mut r: [u32; 8] = [0u32; 8]; + vst4_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_u8() { + 
let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let e: [u8; 64] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let mut r: [u8; 64] = [0u8; 64]; + vst4q_u8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_u16() { + let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [u16; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let mut r: [u16; 32] = [0u16; 32]; + vst4q_u16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_u32() { + let a: [u32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [u32; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let mut r: [u32; 16] = [0u32; 16]; + vst4q_u32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_p8() { + let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [u8; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let mut r: [u8; 32] = [0u8; 32]; + vst4_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_p16() { + let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let e: [u16; 16] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16]; + let mut r: [u16; 16] = [0u16; 16]; + vst4_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_p8() { + let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let e: [u8; 64] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64]; + let mut r: [u8; 64] = [0u8; 64]; + vst4q_p8(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_p16() { + let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let e: [u16; 32] = [1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32]; + let mut r: [u16; 32] = [0u16; 32]; + vst4q_p16(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_u64() { + let a: [u64; 5] = [0, 1, 2, 2, 6]; + let e: [u64; 4] = [1, 2, 2, 6]; + let mut r: [u64; 4] = [0u64; 4]; + 
vst4_u64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_p64() { + let a: [u64; 5] = [0, 1, 2, 2, 6]; + let e: [u64; 4] = [1, 2, 2, 6]; + let mut r: [u64; 4] = [0u64; 4]; + vst4_p64(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4_f32() { + let a: [f32; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.]; + let e: [f32; 8] = [1., 2., 2., 6., 2., 6., 6., 8.]; + let mut r: [f32; 8] = [0f32; 8]; + vst4_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + + #[simd_test(enable = "neon")] + unsafe fn test_vst4q_f32() { + let a: [f32; 17] = [0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.]; + let e: [f32; 16] = [1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.]; + let mut r: [f32; 16] = [0f32; 16]; + vst4q_f32(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast())); + assert_eq!(r, e); + } + #[simd_test(enable = "neon")] unsafe fn test_vmul_s8() { let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2); diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec index 1393c68184..a57993a001 100644 --- a/crates/stdarch-gen/neon.spec +++ b/crates/stdarch-gen/neon.spec @@ -2946,12 +2946,15 @@ arm-aarch64-separate aarch64 = st3 link-aarch64 = st3._EXTpi8_ -//generate *mut i64:int64x2x3_t:void +generate *mut i64:int64x2x3_t:void arm = vst3 link-arm = vst3._EXTpi8r_ -//generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void, *mut i64:int64x1x3_t:void -//generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void +generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void +generate *mut i8:int8x16x3_t:void, *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void +arm = nop +aarch64 = nop +generate *mut i64:int64x1x3_t:void /// Store multiple 3-element structures from three registers name = vst3 @@ -2962,17 +2965,20 @@ validate 1, 2, 2, 2, 4, 4, 2, 7, 7, 4, 8, 8, 2, 13, 13, 4, 14, 14, 7, 15, 15, 8, store_fn aarch64 = st3 -//generate *mut u64:uint64x2x3_t:void +generate *mut u64:uint64x2x3_t:void target = aes -//generate *mut p64:poly64x2x3_t:void +generate *mut p64:poly64x2x3_t:void target = default arm = vst3 -//generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void, *mut u64:uint64x1x3_t:void -//generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void -//generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void +generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void +generate *mut u8:uint8x16x3_t:void, *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void +generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p8:poly8x16x3_t:void, *mut p16:poly16x8x3_t:void +arm = nop +aarch64 = nop +generate *mut u64:uint64x1x3_t:void target = aes -//generate *mut p64:poly64x1x3_t:void +generate *mut p64:poly64x1x3_t:void /// Store multiple 3-element structures from three registers name = vst3 @@ -2984,11 +2990,13 @@ arm-aarch64-separate aarch64 = st3 link-aarch64 = st3._EXTpi8_ -//generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void +generate *mut f64:float64x1x3_t:void +aarch64 = nop +generate *mut f64:float64x2x3_t:void arm = vst3 link-arm = vst3._EXTpi8r_ 
-//generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void +generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void /// Store multiple 3-element structures from three registers name = vst3 @@ -3065,12 +3073,15 @@ arm-aarch64-separate aarch64 = st4 link-aarch64 = st4._EXTpi8_ -//generate *mut i64:int64x2x4_t:void +generate *mut i64:int64x2x4_t:void arm = vst4 link-arm = vst4._EXTpi8r_ -//generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void, *mut i64:int64x1x4_t:void -//generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void +generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void +generate *mut i8:int8x16x4_t:void, *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void +arm = nop +aarch64 = nop +generate *mut i64:int64x1x4_t:void /// Store multiple 4-element structures from four registers name = vst4 @@ -3081,17 +3092,20 @@ validate 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 1 store_fn aarch64 = st4 -//generate *mut u64:uint64x2x4_t:void +generate *mut u64:uint64x2x4_t:void target = aes -//generate *mut p64:poly64x2x4_t:void +generate *mut p64:poly64x2x4_t:void target = default arm = vst4 -//generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void, *mut u64:uint64x1x4_t:void -//generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void -//generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void +generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void +generate *mut u8:uint8x16x4_t:void, *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void +generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p8:poly8x16x4_t:void, *mut p16:poly16x8x4_t:void +arm = nop +aarch64 = nop +generate *mut u64:uint64x1x4_t:void target = aes -//generate *mut p64:poly64x1x4_t:void +generate *mut p64:poly64x1x4_t:void /// Store multiple 4-element structures from four registers name = vst4 @@ -3103,11 +3117,13 @@ arm-aarch64-separate aarch64 = st4 link-aarch64 = st4._EXTpi8_ -//generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void +generate *mut f64:float64x1x4_t:void +aarch64 = nop +generate *mut f64:float64x2x4_t:void arm = vst4 link-arm = vst4._EXTpi8r_ -//generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void +generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void /// Store multiple 4-element structures from four registers name = vst4 From 1807deeeb9b2d826d8573d9430307f1b577cb07e Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 17:44:13 +0800 Subject: [PATCH 25/28] change instr limit --- crates/stdarch-test/src/lib.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/crates/stdarch-test/src/lib.rs b/crates/stdarch-test/src/lib.rs index cc7607bac6..078736c66a 100644 --- a/crates/stdarch-test/src/lib.rs +++ b/crates/stdarch-test/src/lib.rs @@ -138,6 +138,9 @@ pub fn assert(shim_addr: usize, fnname: &str, expected: &str) { // core_arch/src/arm_shared/simd32 // vst1q_s64_x4_vst1 : #instructions = 40 >= 22 (limit) "vst1" => 41, + // core_arch/src/arm_shared/simd32 + // vst4q_u32_vst4 : #instructions = 26 >= 22 (limit) + "vst4" => 27, // Temporary, currently the fptosi.sat and fptoui.sat LLVM // intrinsics emit unnecessary code on arm. 
This can be From 67e28c48e72527a4a94cd25eb3833b9351c8d266 Mon Sep 17 00:00:00 2001 From: SparrowLii Date: Sun, 26 Sep 2021 17:50:37 +0800 Subject: [PATCH 26/28] correct assert_instr --- crates/core_arch/src/aarch64/neon/generated.rs | 8 ++++---- crates/core_arch/src/arm_shared/neon/generated.rs | 8 ++++---- crates/stdarch-gen/neon.spec | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs index 53fa330947..eda1db79b7 100644 --- a/crates/core_arch/src/aarch64/neon/generated.rs +++ b/crates/core_arch/src/aarch64/neon/generated.rs @@ -5512,7 +5512,7 @@ pub unsafe fn vst3q_p64(a: *mut p64, b: poly64x2x3_t) { /// Store multiple 3-element structures from three registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(st3))] +#[cfg_attr(test, assert_instr(nop))] pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -5525,7 +5525,7 @@ pub unsafe fn vst3_f64(a: *mut f64, b: float64x1x3_t) { /// Store multiple 3-element structures from three registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr(test, assert_instr(st3))] pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -5567,7 +5567,7 @@ pub unsafe fn vst4q_p64(a: *mut p64, b: poly64x2x4_t) { /// Store multiple 4-element structures from four registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(st4))] +#[cfg_attr(test, assert_instr(nop))] pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -5580,7 +5580,7 @@ pub unsafe fn vst4_f64(a: *mut f64, b: float64x1x4_t) { /// Store multiple 4-element structures from four registers #[inline] #[target_feature(enable = "neon")] -#[cfg_attr(test, assert_instr(nop))] +#[cfg_attr(test, assert_instr(st4))] pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 2930f27cc5..0c16093295 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -12204,7 +12204,7 @@ vst3_f32_(a.cast(), b.0, b.1, b.2, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] pub unsafe fn vst3_f32(a: *mut f32, b: float32x2x3_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -12232,7 +12232,7 @@ vst3q_f32_(a.cast(), b.0, b.1, b.2, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3))] pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -12576,7 +12576,7 @@ vst4_f32_(a.cast(), b.0, b.1, b.2, b.3, 4) #[inline] #[cfg(target_arch = "aarch64")] #[target_feature(enable = "neon")] -#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) { #[allow(improper_ctypes)] extern "unadjusted" { @@ -12604,7 +12604,7 @@ 
 #[inline]
 #[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
 pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
     #[allow(improper_ctypes)]
     extern "unadjusted" {
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index a57993a001..451a2eaceb 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -2988,10 +2988,10 @@ validate 1., 2., 2., 2., 4., 4., 2., 7., 7., 4., 8., 8., 2., 13., 13., 4.
 store_fn
 arm-aarch64-separate
 
-aarch64 = st3
+aarch64 = nop
 link-aarch64 = st3._EXTpi8_
 generate *mut f64:float64x1x3_t:void
-aarch64 = nop
+aarch64 = st3
 generate *mut f64:float64x2x3_t:void
 
 arm = vst3
@@ -3115,10 +3115,10 @@ validate 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.
 store_fn
 arm-aarch64-separate
 
-aarch64 = st4
+aarch64 = nop
 link-aarch64 = st4._EXTpi8_
 generate *mut f64:float64x1x4_t:void
-aarch64 = nop
+aarch64 = st4
 generate *mut f64:float64x2x4_t:void
 
 arm = vst4

From 9afe630078a329bbecedabe1b0808b9f27077527 Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 17:59:27 +0800
Subject: [PATCH 27/28] add vst2_lane neon instrs

---
 .../core_arch/src/aarch64/neon/generated.rs |  234 +++++++++
 .../src/arm_shared/neon/generated.rs        |  455 ++++++++++++++++++
 crates/stdarch-gen/neon.spec                |   32 +-
 3 files changed, 705 insertions(+), 16 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index eda1db79b7..21fe7a3c35 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -5480,6 +5480,141 @@ pub unsafe fn vst2q_f64(a: *mut f64, b: float64x2x2_t) {
     vst2q_f64_(b.0, b.1, a.cast())
 }
 
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x2_t) {
+    static_assert_imm4!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v16i8.p0i8")]
+        fn vst2q_lane_s8_(a: int8x16_t, b: int8x16_t, n: i64, ptr: *mut i8);
+    }
+    vst2q_lane_s8_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x2_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v1i64.p0i8")]
+        fn vst2_lane_s64_(a: int64x1_t, b: int64x1_t, n: i64, ptr: *mut i8);
+    }
+    vst2_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x2_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2i64.p0i8")]
+        fn vst2q_lane_s64_(a: int64x2_t, b: int64x2_t, n: i64, ptr: *mut i8);
+    }
+    vst2q_lane_s64_(b.0, b.1, LANE as i64, a.cast())
+}
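The lane index on these new intrinsics is a const generic checked by the static_assert macros, so an out-of-range lane fails at compile time rather than at run time. A minimal usage sketch, assuming the intrinsics land in core::arch::aarch64 exactly as generated above and that the destination is valid for two i64 writes:

#[cfg(target_arch = "aarch64")]
unsafe fn store_second_lanes(dst: *mut i64, pair: core::arch::aarch64::int64x2x2_t) {
    // Writes pair.0[1] to dst[0] and pair.1[1] to dst[1]; LANE = 1 is
    // accepted because static_assert_imm1! allows lanes 0 and 1.
    core::arch::aarch64::vst2q_lane_s64::<1>(dst, pair);
}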
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16x2_t) {
+    static_assert_imm4!(LANE);
+    transmute(vst2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x2_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vst2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2x2_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16x2_t) {
+    static_assert_imm4!(LANE);
+    transmute(vst2q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1x2_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vst2_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2x2_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst2q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x2_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v1f64.p0i8")]
+        fn vst2_lane_f64_(a: float64x1_t, b: float64x1_t, n: i64, ptr: *mut i8);
+    }
+    vst2_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x2_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2f64.p0i8")]
+        fn vst2q_lane_f64_(a: float64x2_t, b: float64x2_t, n: i64, ptr: *mut i8);
+    }
+    vst2q_lane_f64_(b.0, b.1, LANE as i64, a.cast())
+}
+
 /// Store multiple 3-element structures from three registers
 #[inline]
 #[target_feature(enable = "neon")]
@@ -14635,6 +14770,105 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_s8() {
+        let a: [i8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+        let e: [i8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i8; 32] = [0i8; 32];
+        vst2q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_s64() {
+        let a: [i64; 3] = [0, 1, 2];
+        let e: [i64; 2] = [1, 2];
+        let mut r: [i64; 2] = [0i64; 2];
+        vst2_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_s64() {
+        let a: [i64; 5] = [0, 1, 2, 2, 3];
+        let e: [i64; 4] = [1, 2, 0, 0];
+        let mut r: [i64; 4] = [0i64; 4];
+        vst2q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_u8() {
+        let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+        let e: [u8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst2q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_u64() {
+        let a: [u64; 3] = [0, 1, 2];
+        let e: [u64; 2] = [1, 2];
+        let mut r: [u64; 2] = [0u64; 2];
+        vst2_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_u64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 3];
+        let e: [u64; 4] = [1, 2, 0, 0];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst2q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_p8() {
+        let a: [u8; 33] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17];
+        let e: [u8; 32] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst2q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_p64() {
+        let a: [u64; 3] = [0, 1, 2];
+        let e: [u64; 2] = [1, 2];
+        let mut r: [u64; 2] = [0u64; 2];
+        vst2_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_p64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 3];
+        let e: [u64; 4] = [1, 2, 0, 0];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst2q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_f64() {
+        let a: [f64; 3] = [0., 1., 2.];
+        let e: [f64; 2] = [1., 2.];
+        let mut r: [f64; 2] = [0f64; 2];
+        vst2_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_f64() {
+        let a: [f64; 5] = [0., 1., 2., 2., 3.];
+        let e: [f64; 4] = [1., 2., 0., 0.];
+        let mut r: [f64; 4] = [0f64; 4];
+        vst2q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
"neon")] unsafe fn test_vst3q_s64() { let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4]; diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs index 0c16093295..1cd9aa5520 100644 --- a/crates/core_arch/src/arm_shared/neon/generated.rs +++ b/crates/core_arch/src/arm_shared/neon/generated.rs @@ -11870,6 +11870,326 @@ pub unsafe fn vst2q_f32(a: *mut f32, b: float32x4x2_t) { vst2q_f32_(b.0, b.1, a.cast()) } +/// Store multiple 2-element structures from two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst2_lane_s8(a: *mut i8, b: int8x8x2_t) { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v8i8")] + fn vst2_lane_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, n: i32, size: i32); + } +vst2_lane_s8_(a.cast(), b.0, b.1, LANE, 1) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst2_lane_s8(a: *mut i8, b: int8x8x2_t) { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v8i8.p0i8")] + fn vst2_lane_s8_(a: int8x8_t, b: int8x8_t, n: i64, ptr: *mut i8); + } +vst2_lane_s8_(b.0, b.1, LANE as i64, a.cast()) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst2_lane_s16(a: *mut i16, b: int16x4x2_t) { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v4i16")] + fn vst2_lane_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, n: i32, size: i32); + } +vst2_lane_s16_(a.cast(), b.0, b.1, LANE, 2) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst2_lane_s16(a: *mut i16, b: int16x4x2_t) { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v4i16.p0i8")] + fn vst2_lane_s16_(a: int16x4_t, b: int16x4_t, n: i64, ptr: *mut i8); + } +vst2_lane_s16_(b.0, b.1, LANE as i64, a.cast()) +} + +/// Store multiple 2-element structures from two registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst2_lane_s32(a: *mut i32, b: int32x2x2_t) { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v2i32")] + fn vst2_lane_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, n: i32, size: i32); + } +vst2_lane_s32_(a.cast(), b.0, b.1, LANE, 4) +} + +/// Store multiple 2-element structures from two 
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x2_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2i32.p0i8")]
+        fn vst2_lane_s32_(a: int32x2_t, b: int32x2_t, n: i64, ptr: *mut i8);
+    }
+vst2_lane_s32_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x2_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v8i16")]
+        fn vst2q_lane_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, n: i32, size: i32);
+    }
+vst2q_lane_s16_(a.cast(), b.0, b.1, LANE, 2)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x2_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v8i16.p0i8")]
+        fn vst2q_lane_s16_(a: int16x8_t, b: int16x8_t, n: i64, ptr: *mut i8);
+    }
+vst2q_lane_s16_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x2_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v4i32")]
+        fn vst2q_lane_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, n: i32, size: i32);
+    }
+vst2q_lane_s32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x2_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v4i32.p0i8")]
+        fn vst2q_lane_s32_(a: int32x4_t, b: int32x4_t, n: i64, ptr: *mut i8);
+    }
+vst2q_lane_s32_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8x2_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4x2_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2x2_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst2_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8x2_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4x2_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst2q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8x2_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst2_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4x2_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst2_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8x2_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst2q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
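The unsigned and polynomial variants above are thin wrappers: a lane store only moves bytes, so transmuting to the signed vector types and reusing the signed shim is bit-exact. An equivalence sketch, illustrative only:

#[cfg(target_arch = "aarch64")]
unsafe fn store_u16_pair(dst: *mut u16, pair: core::arch::aarch64::uint16x4x2_t) {
    // Stores exactly the bytes that the signed form
    // vst2_lane_s16::<0>(dst as *mut i16, core::mem::transmute(pair))
    // would store.
    core::arch::aarch64::vst2_lane_u16::<0>(dst, pair);
}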
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x2_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v2f32")]
+        fn vst2_lane_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, n: i32, size: i32);
+    }
+vst2_lane_f32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x2_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v2f32.p0i8")]
+        fn vst2_lane_f32_(a: float32x2_t, b: float32x2_t, n: i64, ptr: *mut i8);
+    }
+vst2_lane_f32_(b.0, b.1, LANE as i64, a.cast())
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x2_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst2lane.p0i8.v4f32")]
+        fn vst2q_lane_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, n: i32, size: i32);
+    }
+vst2q_lane_f32_(a.cast(), b.0, b.1, LANE, 4)
+}
+
+/// Store multiple 2-element structures from two registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st2, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst2q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x2_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st2lane.v4f32.p0i8")]
+        fn vst2q_lane_f32_(a: float32x4_t, b: float32x4_t, n: i64, ptr: *mut i8);
+    }
+vst2q_lane_f32_(b.0, b.1, LANE as i64, a.cast())
+}
+
 /// Store multiple 3-element structures from three registers
 #[inline]
 #[cfg(target_arch = "arm")]
@@ -28759,6 +29079,141 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_s8() {
+        let a: [i8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [i8; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i8; 16] = [0i8; 16];
+        vst2_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_s16() {
+        let a: [i16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [i16; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+        let mut r: [i16; 8] = [0i16; 8];
+        vst2_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_s32() {
+        let a: [i32; 5] = [0, 1, 2, 2, 3];
+        let e: [i32; 4] = [1, 2, 0, 0];
+        let mut r: [i32; 4] = [0i32; 4];
+        vst2_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_s16() {
+        let a: [i16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [i16; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i16; 16] = [0i16; 16];
+        vst2q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_s32() {
+        let a: [i32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [i32; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+        let mut r: [i32; 8] = [0i32; 8];
+        vst2q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_u8() {
+        let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u8; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 16] = [0u8; 16];
+        vst2_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_u16() {
+        let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [u16; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 8] = [0u16; 8];
+        vst2_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_u32() {
+        let a: [u32; 5] = [0, 1, 2, 2, 3];
+        let e: [u32; 4] = [1, 2, 0, 0];
+        let mut r: [u32; 4] = [0u32; 4];
+        vst2_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_u16() {
+        let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u16; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst2q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_u32() {
+        let a: [u32; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [u32; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+        let mut r: [u32; 8] = [0u32; 8];
+        vst2q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_p8() {
+        let a: [u8; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u8; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 16] = [0u8; 16];
+        vst2_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_p16() {
+        let a: [u16; 9] = [0, 1, 2, 2, 3, 2, 3, 4, 5];
+        let e: [u16; 8] = [1, 2, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 8] = [0u16; 8];
+        vst2_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_p16() {
+        let a: [u16; 17] = [0, 1, 2, 2, 3, 2, 3, 4, 5, 2, 3, 4, 5, 6, 7, 8, 9];
+        let e: [u16; 16] = [1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst2q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2_lane_f32() {
+        let a: [f32; 5] = [0., 1., 2., 2., 3.];
+        let e: [f32; 4] = [1., 2., 0., 0.];
+        let mut r: [f32; 4] = [0f32; 4];
+        vst2_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst2q_lane_f32() {
+        let a: [f32; 9] = [0., 1., 2., 2., 3., 2., 3., 4., 5.];
+        let e: [f32; 8] = [1., 2., 0., 0., 0., 0., 0., 0.];
+        let mut r: [f32; 8] = [0f32; 8];
+        vst2q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vst3_s8() {
         let a: [i8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];

diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index 451a2eaceb..3b2dbe5b27 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -2882,16 +2882,16 @@ validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 store_fn
 arm-aarch64-separate
 
-aarch64 = st2lane
+aarch64 = st2
 link-aarch64 = st2lane._EXTpi8_
 const-aarch64 = LANE
-//generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void
+generate *mut i8:int8x16x2_t:void, *mut i64:int64x1x2_t:void, *mut i64:int64x2x2_t:void
 
-arm = vst2lane
+arm = vst2
 link-arm = vst2lane._EXTpi8r_
 const-arm = LANE
-//generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void
-//generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
+generate *mut i8:int8x8x2_t:void, *mut i16:int16x4x2_t:void, *mut i32:int32x2x2_t:void
+generate *mut i16:int16x8x2_t:void, *mut i32:int32x4x2_t:void
 
 /// Store multiple 2-element structures from two registers
 name = vst2
@@ -2904,16 +2904,16 @@ n = 0
 validate 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 store_fn
 
-aarch64 = st2lane
+aarch64 = st2
-//generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void
+generate *mut u8:uint8x16x2_t:void, *mut u64:uint64x1x2_t:void, *mut u64:uint64x2x2_t:void, *mut p8:poly8x16x2_t:void
 target = aes
-//generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void
+generate *mut p64:poly64x1x2_t:void, *mut p64:poly64x2x2_t:void
 target = default
 
-arm = vst2lane
+arm = vst2
-//generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void
-//generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
-//generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void
+generate *mut u8:uint8x8x2_t:void, *mut u16:uint16x4x2_t:void, *mut u32:uint32x2x2_t:void
+generate *mut u16:uint16x8x2_t:void, *mut u32:uint32x4x2_t:void
+generate *mut p8:poly8x8x2_t:void, *mut p16:poly16x4x2_t:void, *mut p16:poly16x8x2_t:void
 
 /// Store multiple 2-element structures from two registers
 name = vst2
@@ -2926,15 +2926,15 @@ validate 1., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
 store_fn
 arm-aarch64-separate
 
-aarch64 = st2lane
+aarch64 = st2
 link-aarch64 = st2lane._EXTpi8_
 const-aarch64 = LANE
-//generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
+generate *mut f64:float64x1x2_t:void, *mut f64:float64x2x2_t:void
 
-arm = vst2lane
+arm = vst2
 link-arm = vst2lane._EXTpi8r_
 const-arm = LANE
-//generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
+generate *mut f32:float32x2x2_t:void, *mut f32:float32x4x2_t:void
 
 /// Store multiple 3-element structures from three registers
 name = vst3

From 0c040dc56b162dbb4a574024a7716458108a10be Mon Sep 17 00:00:00 2001
From: SparrowLii
Date: Sun, 26 Sep 2021 18:10:17 +0800
Subject: [PATCH 28/28] add vst3_lane and vst4_lane neon instrs

---
 .../core_arch/src/aarch64/neon/generated.rs |  468 ++++++++
 .../src/arm_shared/neon/generated.rs        | 1010 ++++++++++++++++-
 crates/stdarch-gen/neon.spec                |   64 +-
 3 files changed, 1460 insertions(+), 82 deletions(-)

diff --git a/crates/core_arch/src/aarch64/neon/generated.rs b/crates/core_arch/src/aarch64/neon/generated.rs
index 21fe7a3c35..9afba07021 100644
--- a/crates/core_arch/src/aarch64/neon/generated.rs
+++ b/crates/core_arch/src/aarch64/neon/generated.rs
@@ -5670,6 +5670,141 @@ pub unsafe fn vst3q_f64(a: *mut f64, b: float64x2x3_t) {
     vst3q_f64_(b.0, b.1, b.2, a.cast())
 }
 
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x3_t) {
+    static_assert_imm4!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v16i8.p0i8")]
+        fn vst3q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, n: i64, ptr: *mut i8);
+    }
+    vst3q_lane_s8_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x3_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v1i64.p0i8")]
+        fn vst3_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, n: i64, ptr: *mut i8);
+    }
+    vst3_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x3_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2i64.p0i8")]
+        fn vst3q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, n: i64, ptr: *mut i8);
+    }
+    vst3q_lane_s64_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16x3_t) {
+    static_assert_imm4!(LANE);
+    transmute(vst3q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x3_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vst3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2x3_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16x3_t) {
+    static_assert_imm4!(LANE);
+    transmute(vst3q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1x3_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vst3_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2x3_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst3q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x3_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v1f64.p0i8")]
+        fn vst3_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, n: i64, ptr: *mut i8);
+    }
+    vst3_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x3_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2f64.p0i8")]
+        fn vst3q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, n: i64, ptr: *mut i8);
+    }
+    vst3q_lane_f64_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[target_feature(enable = "neon")]
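Like the vst2 case, a vst3 lane store writes element LANE of each of the three registers to consecutive slots, which is what the expected arrays in the tests further down encode ([1., 2., 2.] followed by zeros). A hedged sketch for the widest float case, assuming the core::arch::aarch64 re-export:

#[cfg(target_arch = "aarch64")]
unsafe fn store_triple(dst: *mut f64, triple: core::arch::aarch64::float64x2x3_t) {
    // dst[0..3] becomes [triple.0[0], triple.1[0], triple.2[0]].
    core::arch::aarch64::vst3q_lane_f64::<0>(dst, triple);
}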
@@ -5725,6 +5860,141 @@ pub unsafe fn vst4q_f64(a: *mut f64, b: float64x2x4_t) {
     vst4q_f64_(b.0, b.1, b.2, b.3, a.cast())
 }
 
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s8<const LANE: i32>(a: *mut i8, b: int8x16x4_t) {
+    static_assert_imm4!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v16i8.p0i8")]
+        fn vst4q_lane_s8_(a: int8x16_t, b: int8x16_t, c: int8x16_t, d: int8x16_t, n: i64, ptr: *mut i8);
+    }
+    vst4q_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_s64<const LANE: i32>(a: *mut i64, b: int64x1x4_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v1i64.p0i8")]
+        fn vst4_lane_s64_(a: int64x1_t, b: int64x1_t, c: int64x1_t, d: int64x1_t, n: i64, ptr: *mut i8);
+    }
+    vst4_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s64<const LANE: i32>(a: *mut i64, b: int64x2x4_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2i64.p0i8")]
+        fn vst4q_lane_s64_(a: int64x2_t, b: int64x2_t, c: int64x2_t, d: int64x2_t, n: i64, ptr: *mut i8);
+    }
+    vst4q_lane_s64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x16x4_t) {
+    static_assert_imm4!(LANE);
+    transmute(vst4q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x1x4_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vst4_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_u64<const LANE: i32>(a: *mut u64, b: uint64x2x4_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst4q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x16x4_t) {
+    static_assert_imm4!(LANE);
+    transmute(vst4q_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x1x4_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    transmute(vst4_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_p64<const LANE: i32>(a: *mut p64, b: poly64x2x4_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst4q_lane_s64::<LANE>(transmute(a), transmute(b)))
+}
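A corresponding sketch for the four-register case; with the one-element d-register vectors the only legal lane is 0, which the static_assert! bound enforces at compile time (again assuming the core::arch::aarch64 re-export):

#[cfg(target_arch = "aarch64")]
unsafe fn store_quad(dst: *mut i64, quad: core::arch::aarch64::int64x1x4_t) {
    // dst[0..4] becomes [quad.0[0], quad.1[0], quad.2[0], quad.3[0]].
    core::arch::aarch64::vst4_lane_s64::<0>(dst, quad);
}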
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_f64<const LANE: i32>(a: *mut f64, b: float64x1x4_t) {
+    static_assert!(LANE : i32 where LANE == 0);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v1f64.p0i8")]
+        fn vst4_lane_f64_(a: float64x1_t, b: float64x1_t, c: float64x1_t, d: float64x1_t, n: i64, ptr: *mut i8);
+    }
+    vst4_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(test, assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_f64<const LANE: i32>(a: *mut f64, b: float64x2x4_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2f64.p0i8")]
+        fn vst4q_lane_f64_(a: float64x2_t, b: float64x2_t, c: float64x2_t, d: float64x2_t, n: i64, ptr: *mut i8);
+    }
+    vst4q_lane_f64_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
 /// Multiply
 #[inline]
 #[target_feature(enable = "neon")]
@@ -14914,6 +15184,105 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_s8() {
+        let a: [i8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+        let e: [i8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i8; 48] = [0i8; 48];
+        vst3q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_s64() {
+        let a: [i64; 4] = [0, 1, 2, 2];
+        let e: [i64; 3] = [1, 2, 2];
+        let mut r: [i64; 3] = [0i64; 3];
+        vst3_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_s64() {
+        let a: [i64; 7] = [0, 1, 2, 2, 4, 2, 4];
+        let e: [i64; 6] = [1, 2, 2, 0, 0, 0];
+        let mut r: [i64; 6] = [0i64; 6];
+        vst3q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_u8() {
+        let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+        let e: [u8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 48] = [0u8; 48];
+        vst3q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_u64() {
+        let a: [u64; 4] = [0, 1, 2, 2];
+        let e: [u64; 3] = [1, 2, 2];
+        let mut r: [u64; 3] = [0u64; 3];
+        vst3_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_u64() {
+        let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
+        let e: [u64; 6] = [1, 2, 2, 0, 0, 0];
+        let mut r: [u64; 6] = [0u64; 6];
+        vst3q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_p8() {
+        let a: [u8; 49] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16, 25, 26, 27, 28, 29, 30, 31, 32, 2, 4, 7, 8, 13, 14, 15, 16, 41, 42, 43, 44, 45, 46, 47, 48];
+        let e: [u8; 48] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 48] = [0u8; 48];
+        vst3q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_p64() {
+        let a: [u64; 4] = [0, 1, 2, 2];
+        let e: [u64; 3] = [1, 2, 2];
+        let mut r: [u64; 3] = [0u64; 3];
+        vst3_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_p64() {
+        let a: [u64; 7] = [0, 1, 2, 2, 4, 2, 4];
+        let e: [u64; 6] = [1, 2, 2, 0, 0, 0];
+        let mut r: [u64; 6] = [0u64; 6];
+        vst3q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_f64() {
+        let a: [f64; 4] = [0., 1., 2., 2.];
+        let e: [f64; 3] = [1., 2., 2.];
+        let mut r: [f64; 3] = [0f64; 3];
+        vst3_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_f64() {
+        let a: [f64; 7] = [0., 1., 2., 2., 3., 2., 3.];
+        let e: [f64; 6] = [1., 2., 2., 0., 0., 0.];
+        let mut r: [f64; 6] = [0f64; 6];
+        vst3q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vst4q_s64() {
         let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
@@ -14959,6 +15328,105 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_s8() {
+        let a: [i8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+        let e: [i8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i8; 64] = [0i8; 64];
+        vst4q_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_s64() {
+        let a: [i64; 5] = [0, 1, 2, 2, 6];
+        let e: [i64; 4] = [1, 2, 2, 6];
+        let mut r: [i64; 4] = [0i64; 4];
+        vst4_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_s64() {
+        let a: [i64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+        let e: [i64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+        let mut r: [i64; 8] = [0i64; 8];
+        vst4q_lane_s64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_u8() {
+        let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+        let e: [u8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 64] = [0u8; 64];
+        vst4q_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_u64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 6];
+        let e: [u64; 4] = [1, 2, 2, 6];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst4_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_u64() {
+        let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+        let e: [u64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+        let mut r: [u64; 8] = [0u64; 8];
+        vst4q_lane_u64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_p8() {
+        let a: [u8; 65] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 43, 44, 8, 16, 44, 48, 6, 8, 8, 16, 8, 16, 16, 32, 8, 16, 44, 48, 16, 32, 48, 64];
+        let e: [u8; 64] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 64] = [0u8; 64];
+        vst4q_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_p64() {
+        let a: [u64; 5] = [0, 1, 2, 2, 6];
+        let e: [u64; 4] = [1, 2, 2, 6];
+        let mut r: [u64; 4] = [0u64; 4];
+        vst4_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_p64() {
+        let a: [u64; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+        let e: [u64; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+        let mut r: [u64; 8] = [0u64; 8];
+        vst4q_lane_p64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_f64() {
+        let a: [f64; 5] = [0., 1., 2., 2., 6.];
+        let e: [f64; 4] = [1., 2., 2., 6.];
+        let mut r: [f64; 4] = [0f64; 4];
+        vst4_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_f64() {
+        let a: [f64; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+        let e: [f64; 8] = [1., 2., 2., 6., 0., 0., 0., 0.];
+        let mut r: [f64; 8] = [0f64; 8];
+        vst4q_lane_f64::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_f64() {
         let a: f64 = 1.0;

diff --git a/crates/core_arch/src/arm_shared/neon/generated.rs b/crates/core_arch/src/arm_shared/neon/generated.rs
index 1cd9aa5520..95972bd33c 100644
--- a/crates/core_arch/src/arm_shared/neon/generated.rs
+++ b/crates/core_arch/src/arm_shared/neon/generated.rs
@@ -12562,6 +12562,326 @@ pub unsafe fn vst3q_f32(a: *mut f32, b: float32x4x3_t) {
 vst3q_f32_(b.0, b.1, b.2, a.cast())
 }
 
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x3_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v8i8")]
+        fn vst3_lane_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i32, size: i32);
+    }
+vst3_lane_s8_(a.cast(), b.0, b.1, b.2, LANE, 1)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s8<const LANE: i32>(a: *mut i8, b: int8x8x3_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v8i8.p0i8")]
+        fn vst3_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, n: i64, ptr: *mut i8);
+    }
+vst3_lane_s8_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x3_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v4i16")]
+        fn vst3_lane_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i32, size: i32);
+    }
+vst3_lane_s16_(a.cast(), b.0, b.1, b.2, LANE, 2)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s16<const LANE: i32>(a: *mut i16, b: int16x4x3_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v4i16.p0i8")]
+        fn vst3_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, n: i64, ptr: *mut i8);
+    }
+vst3_lane_s16_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x3_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v2i32")]
+        fn vst3_lane_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i32, size: i32);
+    }
+vst3_lane_s32_(a.cast(), b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x3_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2i32.p0i8")]
+        fn vst3_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, n: i64, ptr: *mut i8);
+    }
+vst3_lane_s32_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x3_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v8i16")]
+        fn vst3q_lane_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i32, size: i32);
+    }
+vst3q_lane_s16_(a.cast(), b.0, b.1, b.2, LANE, 2)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x3_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v8i16.p0i8")]
+        fn vst3q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, n: i64, ptr: *mut i8);
+    }
+vst3q_lane_s16_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x3_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v4i32")]
+        fn vst3q_lane_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i32, size: i32);
+    }
+vst3q_lane_s32_(a.cast(), b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x3_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v4i32.p0i8")]
+        fn vst3q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, n: i64, ptr: *mut i8);
+    }
+vst3q_lane_s32_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8x3_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst3_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4x3_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst3_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2x3_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst3_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8x3_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst3q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4x3_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst3q_lane_s32::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8x3_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst3_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4x3_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst3_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8x3_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst3q_lane_s16::<LANE>(transmute(a), transmute(b)))
+}
+
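As a usage sketch (assuming an AArch64 target where these intrinsics are exposed through core::arch::aarch64; the demo function itself is hypothetical): vst3_lane_s16::<LANE> stores exactly three elements, b.0[LANE], b.1[LANE], b.2[LANE], contiguously, and leaves the rest of the destination untouched:

    unsafe fn vst3_lane_demo() {
        use core::arch::aarch64::*;
        // Three 4-lane registers, each filled with one scalar.
        let b = int16x4x3_t(vdup_n_s16(10), vdup_n_s16(20), vdup_n_s16(30));
        let mut out = [0i16; 4];
        vst3_lane_s16::<1>(out.as_mut_ptr(), b); // store lane 1 of each register
        assert_eq!(out, [10, 20, 30, 0]);
    }
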
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x3_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v2f32")]
+        fn vst3_lane_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i32, size: i32);
+    }
+vst3_lane_f32_(a.cast(), b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x3_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v2f32.p0i8")]
+        fn vst3_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, n: i64, ptr: *mut i8);
+    }
+vst3_lane_f32_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x3_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst3lane.p0i8.v4f32")]
+        fn vst3q_lane_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i32, size: i32);
+    }
+vst3q_lane_f32_(a.cast(), b.0, b.1, b.2, LANE, 4)
+}
+
+/// Store multiple 3-element structures from three registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st3, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst3q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x3_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st3lane.v4f32.p0i8")]
+        fn vst3q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, n: i64, ptr: *mut i8);
+    }
+vst3q_lane_f32_(b.0, b.1, b.2, LANE as i64, a.cast())
+}
+
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
@@ -12812,126 +13132,446 @@ pub unsafe fn vst4q_u16(a: *mut u16, b: uint16x8x4_t) {
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4q_u32(a: *mut u32, b: uint32x4x4_t) {
-    transmute(vst4q_s32(transmute(a), transmute(b)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+pub unsafe fn vst4q_u32(a: *mut u32, b: uint32x4x4_t) {
+    transmute(vst4q_s32(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+pub unsafe fn vst4_p8(a: *mut p8, b: poly8x8x4_t) {
+    transmute(vst4_s8(transmute(a), transmute(b)))
+}
+
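The unsigned and polynomial variants here delegate to the signed implementations through transmute. This is sound because the tuple types differ only in how the elements are interpreted, e.g. poly8x8x4_t and int8x8x4_t have bit-for-bit identical layout, and a store only moves bits. Schematically (restating the pattern, not new code):

    // transmute(vst4_s8(transmute(a), transmute(b)))
    //   ^ reinterprets *mut p8 as *mut i8 and poly8x8x4_t as int8x8x4_t,
    //     then forwards to the signed store; no data is converted.
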
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+pub unsafe fn vst4_p16(a: *mut p16, b: poly16x4x4_t) {
+    transmute(vst4_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+pub unsafe fn vst4q_p8(a: *mut p8, b: poly8x16x4_t) {
+    transmute(vst4q_s8(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+pub unsafe fn vst4q_p16(a: *mut p16, b: poly16x8x4_t) {
+    transmute(vst4q_s16(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+pub unsafe fn vst4_u64(a: *mut u64, b: uint64x1x4_t) {
+    transmute(vst4_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon,aes")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
+pub unsafe fn vst4_p64(a: *mut p64, b: poly64x1x4_t) {
+    transmute(vst4_s64(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
+pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v2f32")]
+        fn vst4_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, size: i32);
+    }
+vst4_f32_(a.cast(), b.0, b.1, b.2, b.3, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
+pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f32.p0i8")]
+        fn vst4_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut i8);
+    }
+vst4_f32_(b.0, b.1, b.2, b.3, a.cast())
+}
+
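For the whole-vector forms, a small semantic sketch (again assuming AArch64 and core::arch::aarch64; the demo function is hypothetical): vst4 interleaves its four registers element-wise, so register j contributes every fourth output element starting at offset j:

    unsafe fn vst4_demo() {
        use core::arch::aarch64::*;
        let b = float32x2x4_t(vdup_n_f32(1.), vdup_n_f32(2.), vdup_n_f32(3.), vdup_n_f32(4.));
        let mut out = [0f32; 8];
        vst4_f32(out.as_mut_ptr(), b);
        // Interleaved layout: [a0, b0, c0, d0, a1, b1, c1, d1]
        assert_eq!(out, [1., 2., 3., 4., 1., 2., 3., 4.]);
    }
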
+#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))] +pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) { + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4f32.p0i8")] + fn vst4q_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut i8); + } +vst4q_f32_(b.0, b.1, b.2, b.3, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_s8(a: *mut i8, b: int8x8x4_t) { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v8i8")] + fn vst4_lane_s8_(ptr: *mut i8, a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i32, size: i32); + } +vst4_lane_s8_(a.cast(), b.0, b.1, b.2, b.3, LANE, 1) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_s8(a: *mut i8, b: int8x8x4_t) { + static_assert_imm3!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v8i8.p0i8")] + fn vst4_lane_s8_(a: int8x8_t, b: int8x8_t, c: int8x8_t, d: int8x8_t, n: i64, ptr: *mut i8); + } +vst4_lane_s8_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_s16(a: *mut i16, b: int16x4x4_t) { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v4i16")] + fn vst4_lane_s16_(ptr: *mut i8, a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i32, size: i32); + } +vst4_lane_s16_(a.cast(), b.0, b.1, b.2, b.3, LANE, 2) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "aarch64")] +#[target_feature(enable = "neon")] +#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_s16(a: *mut i16, b: int16x4x4_t) { + static_assert_imm2!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v4i16.p0i8")] + fn vst4_lane_s16_(a: int16x4_t, b: int16x4_t, c: int16x4_t, d: int16x4_t, n: i64, ptr: *mut i8); + } +vst4_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a.cast()) +} + +/// Store multiple 4-element structures from four registers +#[inline] +#[cfg(target_arch = "arm")] +#[target_feature(enable = "neon,v7")] +#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))] +#[rustc_legacy_const_generics(2)] +pub unsafe fn vst4_lane_s32(a: *mut i32, b: int32x2x4_t) { + static_assert_imm1!(LANE); + #[allow(improper_ctypes)] + extern "unadjusted" { + #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v2i32")] + fn vst4_lane_s32_(ptr: *mut i8, a: int32x2_t, 
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x4_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v2i32")]
+        fn vst4_lane_s32_(ptr: *mut i8, a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i32, size: i32);
+    }
+vst4_lane_s32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_s32<const LANE: i32>(a: *mut i32, b: int32x2x4_t) {
+    static_assert_imm1!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2i32.p0i8")]
+        fn vst4_lane_s32_(a: int32x2_t, b: int32x2_t, c: int32x2_t, d: int32x2_t, n: i64, ptr: *mut i8);
+    }
+vst4_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x4_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v8i16")]
+        fn vst4q_lane_s16_(ptr: *mut i8, a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i32, size: i32);
+    }
+vst4q_lane_s16_(a.cast(), b.0, b.1, b.2, b.3, LANE, 2)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s16<const LANE: i32>(a: *mut i16, b: int16x8x4_t) {
+    static_assert_imm3!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v8i16.p0i8")]
+        fn vst4q_lane_s16_(a: int16x8_t, b: int16x8_t, c: int16x8_t, d: int16x8_t, n: i64, ptr: *mut i8);
+    }
+vst4q_lane_s16_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "arm")]
+#[target_feature(enable = "neon,v7")]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x4_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v4i32")]
+        fn vst4q_lane_s32_(ptr: *mut i8, a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i32, size: i32);
+    }
+vst4q_lane_s32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[cfg(target_arch = "aarch64")]
+#[target_feature(enable = "neon")]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_s32<const LANE: i32>(a: *mut i32, b: int32x4x4_t) {
+    static_assert_imm2!(LANE);
+    #[allow(improper_ctypes)]
+    extern "unadjusted" {
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v4i32.p0i8")]
+        fn vst4q_lane_s32_(a: int32x4_t, b: int32x4_t, c: int32x4_t, d: int32x4_t, n: i64, ptr: *mut i8);
+    }
+vst4q_lane_s32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
+}
+
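The lane forms write a single 4-element structure rather than the whole vector; a sketch under the same assumptions as the earlier demos (hypothetical function name):

    unsafe fn vst4_lane_demo() {
        use core::arch::aarch64::*;
        let b = int32x2x4_t(vdup_n_s32(1), vdup_n_s32(2), vdup_n_s32(3), vdup_n_s32(4));
        let mut out = [0i32; 4];
        vst4_lane_s32::<0>(out.as_mut_ptr(), b); // exactly four i32 are written
        assert_eq!(out, [1, 2, 3, 4]);
    }
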
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_u8<const LANE: i32>(a: *mut u8, b: uint8x8x4_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst4_lane_s8::<LANE>(transmute(a), transmute(b)))
+}
+
+/// Store multiple 4-element structures from four registers
+#[inline]
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x4x4_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst4_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4_p8(a: *mut p8, b: poly8x8x4_t) {
-    transmute(vst4_s8(transmute(a), transmute(b)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x2x4_t) {
+    static_assert_imm1!(LANE);
+    transmute(vst4_lane_s32::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4_p16(a: *mut p16, b: poly16x4x4_t) {
-    transmute(vst4_s16(transmute(a), transmute(b)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_u16<const LANE: i32>(a: *mut u16, b: uint16x8x4_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst4q_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4q_p8(a: *mut p8, b: poly8x16x4_t) {
-    transmute(vst4q_s8(transmute(a), transmute(b)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_u32<const LANE: i32>(a: *mut u32, b: uint32x4x4_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst4q_lane_s32::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4q_p16(a: *mut p16, b: poly16x8x4_t) {
-    transmute(vst4q_s16(transmute(a), transmute(b)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_p8<const LANE: i32>(a: *mut p8, b: poly8x8x4_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst4_lane_s8::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[target_feature(enable = "neon")]
 #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
-pub unsafe fn vst4_u64(a: *mut u64, b: uint64x1x4_t) {
-    transmute(vst4_s64(transmute(a), transmute(b)))
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x4x4_t) {
+    static_assert_imm2!(LANE);
+    transmute(vst4_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
-#[target_feature(enable = "neon,aes")]
-#[cfg_attr(target_arch = "arm", target_feature(enable = "aes,v8"))]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(nop))]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(nop))]
-pub unsafe fn vst4_p64(a: *mut p64, b: poly64x1x4_t) {
-    transmute(vst4_s64(transmute(a), transmute(b)))
+#[target_feature(enable = "neon")]
+#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_p16<const LANE: i32>(a: *mut p16, b: poly16x8x4_t) {
+    static_assert_imm3!(LANE);
+    transmute(vst4q_lane_s16::<LANE>(transmute(a), transmute(b)))
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x4_t) {
+    static_assert_imm1!(LANE);
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v2f32")]
-        fn vst4_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, size: i32);
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v2f32")]
+        fn vst4_lane_f32_(ptr: *mut i8, a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i32, size: i32);
     }
-vst4_f32_(a.cast(), b.0, b.1, b.2, b.3, 4)
+vst4_lane_f32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4_f32(a: *mut f32, b: float32x2x4_t) {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x4_t) {
+    static_assert_imm1!(LANE);
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v2f32.p0i8")]
-        fn vst4_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, ptr: *mut i8);
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v2f32.p0i8")]
+        fn vst4_lane_f32_(a: float32x2_t, b: float32x2_t, c: float32x2_t, d: float32x2_t, n: i64, ptr: *mut i8);
     }
-vst4_f32_(b.0, b.1, b.2, b.3, a.cast())
+vst4_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[cfg(target_arch = "arm")]
 #[target_feature(enable = "neon,v7")]
-#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4))]
-pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
+#[cfg_attr(all(test, target_arch = "arm"), assert_instr(vst4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
+    static_assert_imm2!(LANE);
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4.p0i8.v4f32")]
-        fn vst4q_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, size: i32);
+        #[cfg_attr(target_arch = "arm", link_name = "llvm.arm.neon.vst4lane.p0i8.v4f32")]
+        fn vst4q_lane_f32_(ptr: *mut i8, a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i32, size: i32);
     }
-vst4q_f32_(a.cast(), b.0, b.1, b.2, b.3, 4)
+vst4q_lane_f32_(a.cast(), b.0, b.1, b.2, b.3, LANE, 4)
 }
 
 /// Store multiple 4-element structures from four registers
 #[inline]
 #[cfg(target_arch = "aarch64")]
 #[target_feature(enable = "neon")]
-#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4))]
-pub unsafe fn vst4q_f32(a: *mut f32, b: float32x4x4_t) {
+#[cfg_attr(all(test, target_arch = "aarch64"), assert_instr(st4, LANE = 0))]
+#[rustc_legacy_const_generics(2)]
+pub unsafe fn vst4q_lane_f32<const LANE: i32>(a: *mut f32, b: float32x4x4_t) {
+    static_assert_imm2!(LANE);
     #[allow(improper_ctypes)]
     extern "unadjusted" {
-        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4.v4f32.p0i8")]
-        fn vst4q_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, ptr: *mut i8);
+        #[cfg_attr(target_arch = "aarch64", link_name = "llvm.aarch64.neon.st4lane.v4f32.p0i8")]
+        fn vst4q_lane_f32_(a: float32x4_t, b: float32x4_t, c: float32x4_t, d: float32x4_t, n: i64, ptr: *mut i8);
     }
-vst4q_f32_(b.0, b.1, b.2, b.3, a.cast())
+vst4q_lane_f32_(b.0, b.1, b.2, b.3, LANE as i64, a.cast())
 }
 
 /// Multiply
@@ -29403,6 +30043,141 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_s8() {
+        let a: [i8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+        let e: [i8; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i8; 24] = [0i8; 24];
+        vst3_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_s16() {
+        let a: [i16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+        let e: [i16; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i16; 12] = [0i16; 12];
+        vst3_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_s32() {
+        let a: [i32; 7] = [0, 1, 2, 2, 4, 2, 4];
+        let e: [i32; 6] = [1, 2, 2, 0, 0, 0];
+        let mut r: [i32; 6] = [0i32; 6];
+        vst3_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
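All the tests that follow share one shape: the source array is read starting at offset 1 (a[1..]) through core::ptr::read_unaligned, which both assembles the register tuple from plain memory and deliberately exercises an unaligned input pointer, and the expected array e covers only the few elements the lane store actually writes; everything else must stay zero. Schematically (comments only, restating the pattern):

    // let b: int16x8x3_t = core::ptr::read_unaligned(a[1..].as_ptr().cast());
    // vst3q_lane_s16::<0>(r.as_mut_ptr(), b);
    // assert_eq!(r, e); // e = [1, 2, 2, 0, 0, ...]
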
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_s16() {
+        let a: [i16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+        let e: [i16; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i16; 24] = [0i16; 24];
+        vst3q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_s32() {
+        let a: [i32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+        let e: [i32; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i32; 12] = [0i32; 12];
+        vst3q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_u8() {
+        let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+        let e: [u8; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 24] = [0u8; 24];
+        vst3_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_u16() {
+        let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+        let e: [u16; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 12] = [0u16; 12];
+        vst3_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_u32() {
+        let a: [u32; 7] = [0, 1, 2, 2, 4, 2, 4];
+        let e: [u32; 6] = [1, 2, 2, 0, 0, 0];
+        let mut r: [u32; 6] = [0u32; 6];
+        vst3_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_u16() {
+        let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+        let e: [u16; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 24] = [0u16; 24];
+        vst3q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_u32() {
+        let a: [u32; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+        let e: [u32; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u32; 12] = [0u32; 12];
+        vst3q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_p8() {
+        let a: [u8; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+        let e: [u8; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 24] = [0u8; 24];
+        vst3_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_p16() {
+        let a: [u16; 13] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8];
+        let e: [u16; 12] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 12] = [0u16; 12];
+        vst3_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_p16() {
+        let a: [u16; 25] = [0, 1, 2, 2, 4, 2, 4, 7, 8, 2, 4, 7, 8, 13, 14, 15, 16, 2, 4, 7, 8, 13, 14, 15, 16];
+        let e: [u16; 24] = [1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 24] = [0u16; 24];
+        vst3q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3_lane_f32() {
+        let a: [f32; 7] = [0., 1., 2., 2., 3., 2., 3.];
+        let e: [f32; 6] = [1., 2., 2., 0., 0., 0.];
+        let mut r: [f32; 6] = [0f32; 6];
+        vst3_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst3q_lane_f32() {
+        let a: [f32; 13] = [0., 1., 2., 2., 3., 2., 3., 4., 5., 2., 3., 4., 5.];
+        let e: [f32; 12] = [1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0.];
+        let mut r: [f32; 12] = [0f32; 12];
+        vst3q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vst4_s8() {
         let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
@@ -29592,6 +30367,141 @@ mod test {
         assert_eq!(r, e);
     }
 
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_s8() {
+        let a: [i8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+        let e: [i8; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i8; 32] = [0i8; 32];
+        vst4_lane_s8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_s16() {
+        let a: [i16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+        let e: [i16; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i16; 16] = [0i16; 16];
+        vst4_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_s32() {
+        let a: [i32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+        let e: [i32; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+        let mut r: [i32; 8] = [0i32; 8];
+        vst4_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_s16() {
+        let a: [i16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+        let e: [i16; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i16; 32] = [0i16; 32];
+        vst4q_lane_s16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_s32() {
+        let a: [i32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+        let e: [i32; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [i32; 16] = [0i32; 16];
+        vst4q_lane_s32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_u8() {
+        let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+        let e: [u8; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst4_lane_u8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_u16() {
+        let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+        let e: [u16; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst4_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_u32() {
+        let a: [u32; 9] = [0, 1, 2, 2, 6, 2, 6, 6, 8];
+        let e: [u32; 8] = [1, 2, 2, 6, 0, 0, 0, 0];
+        let mut r: [u32; 8] = [0u32; 8];
+        vst4_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_u16() {
+        let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+        let e: [u16; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 32] = [0u16; 32];
+        vst4q_lane_u16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_u32() {
+        let a: [u32; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+        let e: [u32; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u32; 16] = [0u32; 16];
+        vst4q_lane_u32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_p8() {
+        let a: [u8; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+        let e: [u8; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u8; 32] = [0u8; 32];
+        vst4_lane_p8::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_p16() {
+        let a: [u16; 17] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16];
+        let e: [u16; 16] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 16] = [0u16; 16];
+        vst4_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_p16() {
+        let a: [u16; 33] = [0, 1, 2, 2, 6, 2, 6, 6, 8, 2, 6, 6, 8, 6, 8, 8, 16, 2, 6, 6, 8, 6, 8, 8, 16, 6, 8, 8, 16, 8, 16, 16, 32];
+        let e: [u16; 32] = [1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0];
+        let mut r: [u16; 32] = [0u16; 32];
+        vst4q_lane_p16::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4_lane_f32() {
+        let a: [f32; 9] = [0., 1., 2., 2., 6., 2., 6., 6., 8.];
+        let e: [f32; 8] = [1., 2., 2., 6., 0., 0., 0., 0.];
+        let mut r: [f32; 8] = [0f32; 8];
+        vst4_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
+    #[simd_test(enable = "neon")]
+    unsafe fn test_vst4q_lane_f32() {
+        let a: [f32; 17] = [0., 1., 2., 2., 6., 2., 6., 6., 8., 2., 6., 6., 8., 6., 8., 8., 16.];
+        let e: [f32; 16] = [1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.];
+        let mut r: [f32; 16] = [0f32; 16];
+        vst4q_lane_f32::<0>(r.as_mut_ptr(), core::ptr::read_unaligned(a[1..].as_ptr().cast()));
+        assert_eq!(r, e);
+    }
+
     #[simd_test(enable = "neon")]
     unsafe fn test_vmul_s8() {
        let a: i8x8 = i8x8::new(1, 2, 1, 2, 1, 2, 1, 2);
diff --git a/crates/stdarch-gen/neon.spec b/crates/stdarch-gen/neon.spec
index 3b2dbe5b27..20f6c3d0fd 100644
--- a/crates/stdarch-gen/neon.spec
+++ b/crates/stdarch-gen/neon.spec
@@ -3009,16 +3009,16 @@ validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 store_fn
 arm-aarch64-separate
 
-aarch64 = st3lane
+aarch64 = st3
 link-aarch64 = st3lane._EXTpi8_
 const-aarch64 = LANE
-//generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void
+generate *mut i8:int8x16x3_t:void, *mut i64:int64x1x3_t:void, *mut i64:int64x2x3_t:void
 
-arm = vst3lane
+arm = vst3
 link-arm = vst3lane._EXTpi8r_
 const-arm = LANE
-//generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void
-//generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void
+generate *mut i8:int8x8x3_t:void, *mut i16:int16x4x3_t:void, *mut i32:int32x2x3_t:void
+generate *mut i16:int16x8x3_t:void, *mut i32:int32x4x3_t:void
 
 /// Store multiple 3-element structures from three registers
 name = vst3
@@ -3031,16 +3031,16 @@ n = 0
 validate 1, 2, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 store_fn
 
-aarch64 = st3lane
-//generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void
+aarch64 = st3
+generate *mut u8:uint8x16x3_t:void, *mut u64:uint64x1x3_t:void, *mut u64:uint64x2x3_t:void, *mut p8:poly8x16x3_t:void
 target = aes
-//generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void
+generate *mut p64:poly64x1x3_t:void, *mut p64:poly64x2x3_t:void
 
 target = default
-arm = vst3lane
-//generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void
-//generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void
-//generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void
+arm = vst3
+generate *mut u8:uint8x8x3_t:void, *mut u16:uint16x4x3_t:void, *mut u32:uint32x2x3_t:void
+generate *mut u16:uint16x8x3_t:void, *mut u32:uint32x4x3_t:void
+generate *mut p8:poly8x8x3_t:void, *mut p16:poly16x4x3_t:void, *mut p16:poly16x8x3_t:void
 
 /// Store multiple 3-element structures from three registers
 name = vst3
@@ -3053,15 +3053,15 @@ validate 1., 2., 2., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
 store_fn
 arm-aarch64-separate
 
-aarch64 = st3lane
+aarch64 = st3
 link-aarch64 = st3lane._EXTpi8_
 const-aarch64 = LANE
-//generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
+generate *mut f64:float64x1x3_t:void, *mut f64:float64x2x3_t:void
 
-arm = vst3lane
+arm = vst3
 link-arm = vst3lane._EXTpi8r_
 const-arm = LANE
-//generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
+generate *mut f32:float32x2x3_t:void, *mut f32:float32x4x3_t:void
 
 /// Store multiple 4-element structures from four registers
 name = vst4
@@ -3136,16 +3136,16 @@ validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 store_fn
 arm-aarch64-separate
 
-aarch64 = st4lane
+aarch64 = st4
 link-aarch64 = st4lane._EXTpi8_
 const-aarch64 = LANE
-//generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void
+generate *mut i8:int8x16x4_t:void, *mut i64:int64x1x4_t:void, *mut i64:int64x2x4_t:void
 
-arm = vst4lane
+arm = vst4
 link-arm = vst4lane._EXTpi8r_
 const-arm = LANE
-//generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void
-//generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void
+generate *mut i8:int8x8x4_t:void, *mut i16:int16x4x4_t:void, *mut i32:int32x2x4_t:void
+generate *mut i16:int16x8x4_t:void, *mut i32:int32x4x4_t:void
 
 /// Store multiple 4-element structures from four registers
 name = vst4
@@ -3158,16 +3158,16 @@ n = 0
 validate 1, 2, 2, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
 store_fn
 
-aarch64 = st4lane
-//generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void
+aarch64 = st4
+generate *mut u8:uint8x16x4_t:void, *mut u64:uint64x1x4_t:void, *mut u64:uint64x2x4_t:void, *mut p8:poly8x16x4_t:void
 target = aes
-//generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void
+generate *mut p64:poly64x1x4_t:void, *mut p64:poly64x2x4_t:void
 
 target = default
-arm = vst4lane
-//generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void
-//generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void
-//generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void
+arm = vst4
+generate *mut u8:uint8x8x4_t:void, *mut u16:uint16x4x4_t:void, *mut u32:uint32x2x4_t:void
+generate *mut u16:uint16x8x4_t:void, *mut u32:uint32x4x4_t:void
+generate *mut p8:poly8x8x4_t:void, *mut p16:poly16x4x4_t:void, *mut p16:poly16x8x4_t:void
 
 /// Store multiple 4-element structures from four registers
 name = vst4
@@ -3180,15 +3180,15 @@ validate 1., 2., 2., 6., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.
 store_fn
 arm-aarch64-separate
 
-aarch64 = st4lane
+aarch64 = st4
 link-aarch64 = st4lane._EXTpi8_
 const-aarch64 = LANE
-//generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
+generate *mut f64:float64x1x4_t:void, *mut f64:float64x2x4_t:void
 
-arm = vst4lane
+arm = vst4
 link-arm = vst4lane._EXTpi8r_
 const-arm = LANE
-//generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
+generate *mut f32:float32x2x4_t:void, *mut f32:float32x4x4_t:void
 
 /// Multiply
 name = vmul
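For readers unfamiliar with stdarch-gen: each uncommented generate line expands the enclosing spec block into one intrinsic per pointer-type/tuple-type pair, taking the LLVM intrinsic name from the link-arm/link-aarch64 lines and the const generic from const-arm/const-aarch64; the change from st3lane/st4lane to st3/st4 (and vst3lane/vst4lane to vst3/vst4) in the aarch64 =/arm = lines tracks the mnemonic the disassembler actually prints for the lane forms, which is what assert_instr matches. Schematically, for one of the lines above:

    generate *mut f32:float32x2x4_t:void
    // -> pub unsafe fn vst4_lane_f32<const LANE: i32>(a: *mut f32, b: float32x2x4_t)
    //    carrying #[cfg_attr(..., assert_instr(st4, LANE = 0))] on AArch64
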