From 30dee02e6e950839ba81cb43a01572ae4dfd763c Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 6 Oct 2022 17:02:11 +0100 Subject: [PATCH 01/25] Add Alpha support for SME2 This patch adds new intrinsics and types for supporting SME2. --- main/acle.md | 2373 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 2371 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index 36d1adf6..2539816b 100644 --- a/main/acle.md +++ b/main/acle.md @@ -1,7 +1,7 @@ --- title: Arm C Language Extensions -version: 2022Q2 -date-of-issue: 01 Jul 2022 +version: 2022Q2 + SME2 (Alpha) +date-of-issue: 06 Oct 2022 # LaTeX specific variables copyright-text: "Copyright: see section \\texorpdfstring{\\nameref{copyright}}{Copyright}." draftversion: true @@ -1698,6 +1698,10 @@ In addition, `__ARM_FEATURE_LOCALLY_STREAMING` is defined to 1 if the [`arm_locally_streaming`](#arm_locally_streaming) attribute is available. +`__ARM_FEATURE_SME2` is defined to 1 if the FEAT_SME2 instructions +are available and if the associated [ACLE +features](#sme-language-extensions-and-intrinsics) are supported. + #### M-profile Vector Extension `__ARM_FEATURE_MVE` is defined as a bitmap to indicate M-profile Vector @@ -2168,6 +2172,7 @@ be found in [[BA]](#BA). | [`__ARM_FEATURE_SM3`](#sm3-extension) | SM3 Crypto extension (Arm v8.4-A, optional Armv8.2-A, Armv8.3-A) | 1 | | [`__ARM_FEATURE_SM4`](#sm4-extension) | SM4 Crypto extension (Arm v8.4-A, optional Armv8.2-A, Armv8.3-A) | 1 | | [`__ARM_FEATURE_SME`](#scalable-matrix-extension-sme) | Scalable Matrix Extension (FEAT_SME) | 1 | +| [`__ARM_FEATURE_SME2`](#scalable-matrix-extension-sme) | Scalable Matrix Extension (FEAT_SME2) | 1 | | [`__ARM_FEATURE_SME_F64F64`](#double-precision-floating-point-outer-product-intrinsics) | Double precision floating-point outer product intrinsics (FEAT_SME_F64F64) | 1 | | [`__ARM_FEATURE_SME_I16I64`](#16-bit-to-64-bit-integer-widening-outer-product-intrinsics) | 16-bit to 64-bit integer widening outer product intrinsics (FEAT_SME_I16I64) | 1 | | [`__ARM_FEATURE_SME_LOCALLY_STREAMING`](#scalable-matrix-extension-sme) | Support for the `arm_locally_streaming` attribute | 1 | @@ -2411,6 +2416,7 @@ The following table lists the architectures feature mapping for AArch64 | 350 | `FEAT_SVE_BitPerm` | sve_bitperm | ```ID_AA64ZFR0_EL1.BitPerm == 0b0001``` | | 360 | `FEAT_SVE_SHA3` | sve_sha3 | ```ID_AA64ZFR0_EL1.SHA3 == 0b0001``` | | 370 | `FEAT_SME` | sme | ```ID_AA64PFR1_EL1.SME == 0b0001``` | + | 380 | `FEAT_SME2` | sme2 | ```ID_AA64PFR1_EL1.SME == 0b0010``` | | 380 | `FEAT_MTE` | mte | ```ID_AA64PFR1_EL1.MTE >= 0b0001``` | | 390 | `FEAT_MTE2` | mte2 | ```ID_AA64PFR1_EL1.MTE >= 0b0010``` | | 400 | `FEAT_MTE3` | mte3 | ```ID_AA64PFR1_EL1.MTE >= 0b0011``` | @@ -8605,6 +8611,19 @@ function F. There are then two cases: The inline asm is [ill-formed](#ill-formed) if it has a `"za"` clobber. +## ZT0 Lookup Table + +When ZA storage is enabled, SME2 additionally provides access to a 64-byte large +lookup table called ZT0 which can be accesses through specialized instructions. +ZT0 is architecturally linked to ZA such that changing PSTATE.ZA enables or +disables both ZA and ZT0 simultaneously. + + +This means that when a function has ZA state, it similarly has ZT state. 
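+The following is an illustrative sketch rather than normative text. It assumes
+the existing SME `arm_streaming` and `arm_new_za` attributes together with the
+ZT0 intrinsics `svldr_zt` and `svluti2_lane_zt` defined later in this section;
+the function name is hypothetical. Because the function creates ZA state, it
+also has ZT state, so ZT0 can be filled and used directly:
+
+``` c
+  // Hedged example: new ZA state implies new ZT state, so ZT0 is available
+  // here without any further setup.
+  __attribute__((arm_streaming, arm_new_za))
+  svuint8_t lookup_example(const uint8_t *table, svuint8_t indices) {
+    svldr_zt(0, table);                        // fill ZT0 from a 64-byte table
+    return svluti2_lane_zt_u8(0, indices, 0);  // 2-bit-index lookup through ZT0
+  }
+```
+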
+ + ## SME attributes All of the attributes described in this section can be specified @@ -8755,6 +8774,12 @@ This attribute applies to **function types** and specifies the following: can use ZA to receive data from its callers and to pass data back to its callers. +* The function has [ZT state](#zt-state). + +* The function's ZT state is created on entry to the function and destroyed + on return from the function. That is, the function does not use ZT0 + to receive data from callers or to pass data back to callers. + * If the function forms part of the object code's ABI, that object code function has a “shared-ZA interface”; see [[AAPCS64]](#AAPCS64) for more details. @@ -8770,6 +8795,12 @@ following: on return from the function. That is, the function does not use ZA to receive data from callers or to pass data back to callers. +* The function has [ZT state](#zt-state). + +* The function's ZT state is created on entry to the function and destroyed + on return from the function. That is, the function does not use ZT0 + to receive data from callers or to pass data back to callers. + This attribute does not change a function's binary interface. If the function forms part of the object code's ABI, that object code function has a “private-ZA interface”, just like all other non-`arm_shared_za` @@ -8828,6 +8859,9 @@ depends on whether the function is [shared-ZA](#shared-za) or The platform may place additional requirements as well. +* ZT state is not considered preserved when a function is marked with + [`arm_preserves_za`](#arm_preserves_za). + In both cases, the onus is on the definition of the function to honor the guarantee that is being made. The attribute does not direct the compiler to do anything to honor the guarantee. @@ -9382,6 +9416,2341 @@ are named after. All of the functions have external linkage. void *__arm_sc_memchr(void *s, int c, size_t n); ``` +## SME2 Types + +### Predicate-as-counter + +SME2 adds a new kind of predicate, named *predicate-as-counter* which is used +for multi-vector predication. It describes a predicate mask that can span multiple +predicate registers with `K` `true` values followed by all `false` values, or +`K` `false` values followed by all `true` values, for a given element type. + +When `__ARM_FEATURE_SME2` is defined, [``](#arm_sme.h) defines a +single sizeless predicate-as-counter type named `svcount_t`. + +`svcount_t` and `svbool_t` are both used to represent predicate masks, but +they cannot be used interchangeably. + +The ACLE allows these types to be casted from one to another using the +`svcount_t svreinterpret_c(svbool_t)` and `svbool_t svreinterpret_b(svcount_t)` +intrinsics, although the reinterpreted values may not be sensible in the other +format. To safely extract a sensible mask from a `svcount_t`, the `svpext` +functions should be used. + +### Multi-vector predicates + +When `__ARM_FEATURE_SME2` is defined, [``](#arm_sme.h) defines the tuple types +`svboolx2_t` and `svboolx4_t`. + +These are opaque tuple types that can be accessed using the existing SVE +intrinsics `svsetN`, `svgetN` and `svcreateN`. `svundef2` and `svundef4` +are also extended to work with `svboolx2_t` and `svboolx4_t`. e.g. 
+ +``` c + svbool_t svget2[_b](svboolx2_t tuple, uint64_t imm_index); + svboolx2_t svset2[_b](svboolx2_t tuple, uint64_t imm_index, svbool_t x); + svboolx2_t svcreate2[_b](svbool_t x, svbool_t y); + svboolx2_t svundef2_b(); +``` + +## SME2 functions + +The functions in this section are defined by the header file +[``](#arm_sme.h) when `__ARM_FEATURE_SME2` is defined. + +#### Common rules + +SME2 adds operations that work on groups of SVE vectors, ZA tile slices or +ZA array vectors. The intrinsics model this in the following way: + +* Multi-vector operands are groups of SVE data vectors, that use the same + tuple types as defined in the [SVE ACLE](#sve-vector-types), e.g. + `svint32x2_t` for a multi-vector operand of two 32-bit element vectors, or + `svint64x4_t` for a multi-vector operand of four 64-bit element vectors. + +* Intrinsic functions have a `_x2` or `_x4` suffix if the + function\'s return value is a vector group of 2 or 4 data vectors + and the function operates purely on vectors, not on the matrix array or + tile slices. + +* Intrinsic functions have a `_vg2` or `_vg4` suffix if the function + operates on groups of 2 or 4 ZA tiles slices. For example: + +``` c + // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. + __attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint8x2_t svread_vg2_hor_za8[_s8](uint32_t slice_base, + uint64_t slice_x2_offset); +``` + +* The architecture distinguishes between tuples with consecutive registers + and tuples with strided registers. This level of detail is not exposed to + the C/C++ intrinsics or types. It is left up to the compiler to choose the + most optimal form. + +* Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function + operates on 2 or 4 single-vector groups within a ZA array. + +* Intrinsic functions have a `_vg2x1`, `_vg2x2`, `_vg2x4` suffix if + the function operates on 1, 2 or 4 double-vector groups within a ZA array. + +* Intrinsic functions have a `_vg4x1`, `_vg4x2`, `_vg4x4` suffix if the + function operates on 1, 2 or 4 quad-vector groups within a ZA array. + For example: + +``` c + // SMLAL intrinsic for 2 quad-vector groups. + __attributes__((arm_streaming, arm_shared_za)) + void svmlal_vg4x2_lane_za32[_s8](uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); +``` + +* Every argument named `slice_x2_offset` must be an integer constant + expression that is a multiple of 2. + + These immediates are required for indexing a double-vector group in the ZA + array or to index a vector group of 2 tile-slices. + +* Every argument named `slice_x4_offset` must be an integer constant + expression that is a multiple of 4. + + These immediates are required for indexing a quad-vector group in the ZA + array or to index a vector group of 4 tile-slices. + +* Intrinsic functions that take a multi-vector operand may have additional + suffixes to distinguish them from other forms for the same intrinsic: + * a `_single` suffix if they take one multi-vector operand and one + (single) vector operand. + * a `_lane` suffix if they take one multi-vector operand and one + indexed vector operand with an immediate to specify the indexed + elements. 
+ +``` c + __attributes__((arm_streaming, arm_shared_za)) + void svmlal_vg4x2_za32[_s8](uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8x2_t zm); + + + __attributes__((arm_streaming, arm_shared_za)) + void svmlal_vg4x2[_single]_za32[_s8](uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svint8_t zm); + + __attributes__((arm_streaming, arm_shared_za)) + void svmlal_vg4x2_lane_za32[_s8](uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); +``` + +### SME2 data-processing instructions. + +#### ADD, SUB (store into ZA, single) + +Multi-vector add/sub, storing into ZA + +The additional '_write' suffix indicates that the operation is not accumulating, + the result is written directly into ZA. + +``` c + // Variants are available for: + // _za32[_s32] + // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svadd_write[_single]_za32[_s32]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, + svint32x2_t zn, svint32_t zm); + + + // Variants are available for: + // _za32[_s32] + // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svadd_write[_single]_za32[_s32]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, + svint32x4_t zn, svint32_t zm); + + + // Variants are available for: + // _za32[_u32] + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svsub_write[_single]_za32[_u32]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, + svuint32x2_t zn, svuint32_t zm); + + + // Variants are available for: + // _za32[_u32] + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svsub_write[_single]_za32[_u32]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, + svuint32x4_t zn, svuint32_t zm); + + ``` + +#### ADD, SUB (store into ZA, multi) + +Multi-vector add/sub, storing into ZA + +The additional '_write' suffix indicates that the operation is not accumulating, + the result is written directly into ZA. 
+ +``` c + // Variants are available for: + // _za32[_s32] + // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svadd_write_za32[_s32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svint32x2_t zn, svint32x2_t zm); + + + // Variants are available for: + // _za32[_s32] + // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svadd_write_za32[_s32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svint32x4_t zn, svint32x4_t zm); + + + // Variants are available for: + // _za32[_u32] + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svsub_write_za32[_u32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svuint32x2_t zn, svuint32x2_t zm); + + + // Variants are available for: + // _za32[_u32] + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svsub_write_za32[_u32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svuint32x4_t zn, svuint32x4_t zm); + + ``` + +#### ADD, SUB (vectors) + +Multi-vector add/sub + +``` c + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svadd[_single][_s8][_x2](svint8x2_t zdn, svint8_t zm); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svadd[_single][_s8][_x4](svint8x4_t zdn, svint8_t zm); + + ``` + +#### ADD, SUB, FADD, FSUB (accumulate into ZA) + +Multi-vector add/sub and accumulate into ZA + +``` c + // Variants are available for: + // _za32[_f32] + // _za32[_s32] + // _za32[_u32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svadd_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svfloat32x2_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za32[_s32] + // _za32[_u32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svadd_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svfloat32x4_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za32[_u32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svsub_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svfloat32x2_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za32[_u32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svsub_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svfloat32x4_t zm); + + ``` + +#### BFCVTN, FCVTN + +Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16 + +``` c + // Variants are also available for _f16[_f32] + __attribute__((arm_streaming)) + svbfloat16_t svcvtn_bf16[_f32][_x2](svfloat32x2_t zn); + + ``` + +#### 
FCVT, BFCVT, FCVTZS, FCVTZU, SCVTF, UCVTF + +Multi-vector convert to/from floating-point. + +``` c + // Variants are also available for _f16[_f32], _f32[_s32], _f32[_u32], + // _s32[_f32] and _u32[_f32] + __attribute__((arm_streaming)) + svbfloat16_t svcvt_bf16[_f32][_x2](svfloat32x2_t zn); + + + // Variants are also available for _f32[_u32], _s32[_f32] and _u32[_f32] + __attribute__((arm_streaming)) + svfloat32x4_t svcvt_f32[_s32][_x4](svint32x4_t zn); + + ``` + +#### SQCVT, SQCVTU, UQCVT + +Multi-vector saturating extract narrow + +``` c + // Variants are also available for _u16[_s32] and _u16[_u32] + __attribute__((arm_streaming)) + svint16_t svqcvt_s16[_s32][_x2](svint32x2_t zn); + + + // Variants are also available for _s16[_s64], _u8[_s32], _u8[_u32], _u16[_s64] + // and _u16[_u64] + __attribute__((arm_streaming)) + svint8_t svqcvt_s8[_s32][_x4](svint32x4_t zn); + + ``` + +#### SQCVTN, SQCVTUN, UQCVTN + +Multi-vector saturating extract narrow and interleave + +``` c + // Variants are also available for _u16[_s32] and _u16[_u32] + __attribute__((arm_streaming_compatible)) + svint16_t svqcvtn_s16[_s32][_x2](svint32x2_t zn); + + + // Variants are also available for _s16[_s64], _u8[_s32], _u8[_u32], _u16[_s64] + // and _u16[_u64] + __attribute__((arm_streaming)) + svint8_t svqcvtn_s8[_s32][_x4](svint32x4_t zn); + + ``` + +#### UDOT, SDOT, FDOT (vectors) + +Multi-vector dot-product (2-way and 4-way) + +``` c + // Variants are also available for _s32 and _u32 + __attribute__((arm_streaming_compatible)) + svfloat32_t svdot[_f32](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm); + + ``` + +#### UDOT, SDOT, FDOT (indexed) + +Multi-vector dot-product (2-way and 4-way) + +``` c + // Variants are also available for _s32 and _u32 + __attribute__((arm_streaming_compatible)) + svfloat32_t svdot_lane[_f32](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm, + uint64_t imm_idx); + + ``` + +#### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, single) + +Multi-vector dot-product (2-way and 4-way) + +``` c + // Variants are available for: + // _za32[_bf16] + // _za32[_f16] + // _za32[_s8] + // _za32[_s16] + // _za32[_u8] + // _za32[_u16] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svdot[_single]_za32[_bf16]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, + svbfloat16x2_t zn, svbfloat16_t zm); + + + // Variants are available for: + // _za32[_bf16] + // _za32[_f16] + // _za32[_s8] + // _za32[_s16] + // _za32[_u8] + // _za32[_u16] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svdot[_single]_za32[_bf16]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, + svbfloat16x4_t zn, svbfloat16_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsudot[_single]_za32[_u8]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, svint8x2_t zn, + svuint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsudot[_single]_za32[_u8]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, svint8x4_t zn, + svuint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusdot[_single]_za32[_s8]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, svuint8x2_t zn, + svint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusdot[_single]_za32[_s8]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, svuint8x4_t zn, + 
svint8_t zm); + + ``` + +#### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, multi) + +Multi-vector dot-product (2-way and 4-way) + +``` c + // Variants are available for: + // _za32[_bf16] + // _za32[_f16] + // _za32[_s8] + // _za32[_s16] + // _za32[_u8] + // _za32[_u16] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svdot_za32[_bf16]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svbfloat16x2_t zn, svbfloat16x2_t zm); + + + // Variants are available for: + // _za32[_bf16] + // _za32[_f16] + // _za32[_s8] + // _za32[_s16] + // _za32[_u8] + // _za32[_u16] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svdot_za32[_bf16]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svbfloat16x4_t zn, svbfloat16x4_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusdot_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svuint8x2_t zn, svint8x2_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusdot_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svuint8x4_t zn, svint8x4_t zm); + + ``` + +#### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, indexed) + +Multi-vector dot-product (2-way and 4-way) + +``` c + // Variants are available for: + // _za32[_bf16] + // _za32[_f16] + // _za32[_s8] + // _za32[_s16] + // _za32[_u8] + // _za32[_u16] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svdot_lane_za32[_bf16]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svbfloat16x2_t zn, svbfloat16_t zm, + uint64_t imm_idx); + + + // Variants are available for: + // _za32[_bf16] + // _za32[_f16] + // _za32[_s8] + // _za32[_s16] + // _za32[_u8] + // _za32[_u16] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svdot_lane_za32[_bf16]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svbfloat16x4_t zn, svbfloat16_t zm, + uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsudot_lane_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svint8x2_t zn, svuint8_t zm, + uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsudot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svint8x4_t zn, svuint8_t zm, + uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusdot_lane_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svuint8x2_t zn, svint8_t zm, + uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusdot_lane_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svuint8x4_t zn, svint8_t zm, + uint64_t imm_idx); + + ``` + +#### FVDOT, BFVDOT, SUVDOT, USVDOT, SVDOT, UVDOT + +Multi-vector vertical dot-product by indexed element. 
+ +``` c + + __attribute__((arm_streaming, arm_shared_za)) + void svsuvdot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svint8x4_t zn, svuint8_t zm, + uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusvdot_lane_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svuint8x4_t zn, svint8_t zm, + uint64_t imm_idx); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svvdot_lane_za32[_bf16]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svbfloat16x2_t zn, svbfloat16_t zm, + uint64_t imm_idx); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svvdot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + + ``` + +#### UMOPA, SMOPA, UMOPS, SMOPS + +Integer sum of outer products and accumulate/subtract (2-way and 4-way) + +``` c + // Variants are also available for _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmopa_za32[_s16]_m(uint64_t tile, svbool_t pn, svbool_t pm, svint16_t zn, + svint16_t zm); + + + // Variants are also available for _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmops_za32[_s16]_m(uint64_t tile, svbool_t pn, svbool_t pm, svint16_t zn, + svint16_t zm); + + ``` + +#### BMOPA, BMOPS + +Bitwise exclusive NOR population count outer product and accumulate/subtract + +``` c + + __attribute__((arm_streaming, arm_shared_za)) + void svbmopa_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, + svuint32_t zn, svuint32_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svbmops_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, + svuint32_t zn, svuint32_t zm); + + ``` + +#### FMLA, FMLS (single) + +Multi-vector floating-point fused multiply-add/subtract + +``` c + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmla[_single]_za32[_f32]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, svfloat32x2_t zn, + svfloat32_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmla[_single]_za32[_f32]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, svfloat32x4_t zn, + svfloat32_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmls[_single]_za32[_f32]_vg1x2(uint32_t slice_base, + uint64_t slice_offset, svfloat32x2_t zn, + svfloat32_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmls[_single]_za32[_f32]_vg1x4(uint32_t slice_base, + uint64_t slice_offset, svfloat32x4_t zn, + svfloat32_t zm); + + ``` + +#### FMLA, FMLS (multi) + +Multi-vector floating-point fused multiply-add/subtract + +``` c + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmla_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svfloat32x2_t zn, svfloat32x2_t zm); + 
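+  // Illustrative usage sketch (not an intrinsic declaration). Assumes a
+  // streaming, shared-ZA caller; the function name is hypothetical. The call
+  // uses the fully-resolved name of the _vg1x2 form listed above to
+  // accumulate the products of the vector pairs zn and zm into the two ZA
+  // single-vector groups addressed by slice_base plus the constant offset.
+  __attribute__((arm_streaming, arm_shared_za))
+  void example_mla_vg1x2(uint32_t slice_base, svfloat32x2_t zn,
+                         svfloat32x2_t zm) {
+    svmla_za32_f32_vg1x2(slice_base, 0, zn, zm);
+  }
+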
+ + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmla_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svfloat32x4_t zn, svfloat32x4_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmls_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svfloat32x2_t zn, svfloat32x2_t zm); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmls_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svfloat32x4_t zn, svfloat32x4_t zm); + + ``` + +#### FMLA, FMLS (indexed) + +Multi-vector floating-point fused multiply-add/subtract + +``` c + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmla_lane_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svfloat32x2_t zn, svfloat32_t zm, + uint64_t imm_idx); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmla_lane_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svfloat32x4_t zn, svfloat32_t zm, + uint64_t imm_idx); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmls_lane_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svfloat32x2_t zn, svfloat32_t zm, + uint64_t imm_idx); + + + // Variants are available for: + // _za32[_f32] + // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmls_lane_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svfloat32x4_t zn, svfloat32_t zm, + uint64_t imm_idx); + + ``` + +#### FMLAL, BFMLAL, SMLAL, UMLAL (single) + +Multi-vector multiply-add long (widening) + +``` c + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16_t zn, svbfloat16_t zm); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16_t zm); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16_t zm); + + ``` + +#### FMLAL, BFMLAL, SMLAL, UMLAL (multi) + +Multi-vector multiply-add long (widening) + +``` c + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16x2_t zm); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x4_t zn, 
svbfloat16x4_t zm); + + ``` + +#### FMLAL, BFMLAL, SMLAL, UMLAL (indexed) + +Multi-vector multiply-add long (widening) + +``` c + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_lane_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_lane_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x2_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_lane_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x4_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + ``` + +#### BFMLSL, FMLSL, UMLSL, SMLSL (single) + +Multi-vector multiply-subtract long (widening) + +``` c + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16_t zn, svbfloat16_t zm); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16_t zm); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16_t zm); + + ``` + +#### BFMLSL, FMLSL, UMLSL, SMLSL (multi) + +Multi-vector multiply-subtract long (widening) + +``` c + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16x2_t zm); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16x4_t zm); + + ``` + +#### BFMLSL, FMLSL, UMLSL, SMLSL (indexed) + +Multi-vector multiply-subtract long (widening) + +``` c + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_lane_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_lane_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x2_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_lane_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x4_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + ``` + +#### UMLALL, SMLALL, USMLALL, SUMLALL (single) + +Multi-vector multiply-add long long (widening) + +``` c + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // 
_za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal[_single]_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svint8_t zn, + svint8_t zm); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svint8_t zm); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal[_single]_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn, + svint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsumlall[_single]_za32[_u8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, + svint8x2_t zn, svuint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsumlall[_single]_za32[_u8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, + svint8x4_t zn, svuint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall[_single]_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, + svuint8_t zn, svint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, + svuint8x2_t zn, svint8_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall[_single]_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, + svuint8x4_t zn, svint8_t zm); + + ``` + +#### UMLALL, SMLALL, USMLALL, SUMLALL (multi) + +Multi-vector multiply-add long long (widening) + +``` c + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8x2_t zm); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8x4_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svuint8x2_t zn, svint8x2_t zm); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svuint8x4_t zn, svint8x4_t zm); + + ``` + +#### UMLALL, SMLALL, USMLALL, SUMLALL (indexed) + +Multi-vector multiply-add long long (widening) + +``` c + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, + svint8_t zn, svint8_t zm, uint64_t imm_idx); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only 
if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlal_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsumlall_lane_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svint8_t zn, + svuint8_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsumlall_lane_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svuint8_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svsumlall_lane_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn, + svuint8_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall_lane_za32[_u8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svuint8_t zn, + svint8_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall_lane_za32[_u8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svuint8x2_t zn, + svint8_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming, arm_shared_za)) + void svusmlall_lane_za32[_u8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svuint8x4_t zn, + svint8_t zm, uint64_t imm_idx); + + ``` + +#### SMLSLL, UMLSLL (single) + +Multi-vector multiply-subtract long long (widening) + +``` c + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl[_single]_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svint8_t zn, + svint8_t zm); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svint8_t zm); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl[_single]_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn, + svint8_t zm); + + ``` + +#### SMLSLL, UMLSLL (multi) + +Multi-vector multiply-subtract long long (widening) + +``` c + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8x2_t zm); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + 
__attribute__((arm_streaming, arm_shared_za)) + void svmlsl_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8x4_t zm); + + ``` + +#### SMLSLL, UMLSLL (indexed) + +Multi-vector multiply-subtract long long (widening) + +``` c + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, + svint8_t zn, svint8_t zm, uint64_t imm_idx); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + + + // Variants are available for: + // _za32[_s8] + // _za32[_u8] + // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) + // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) + __attribute__((arm_streaming, arm_shared_za)) + void svmlsl_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + + ``` + +#### BFMLSLB, BFMLSLT + +BFloat16 floating-point multiply-subtract long from single-precision (top/bottom) + +``` c + + __attribute__((arm_streaming_compatible)) + svfloat32_t svmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); + + + + __attribute__((arm_streaming_compatible)) + svfloat32_t svmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + + + __attribute__((arm_streaming_compatible)) + svfloat32_t svmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); + + + + __attribute__((arm_streaming_compatible)) + svfloat32_t svmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); + + ``` + +#### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (single) + +Multi-vector min/max + +``` c + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x2_t svmax[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x4_t svmax[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x2_t svmin[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x4_t svmin[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + + ``` + +#### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (multi) + +Multi-vector min/max + +``` c + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x2_t svmax[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x4_t svmax[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + + + // Variants are also available for _f32, _f64, 
_s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x2_t svmin[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x4_t svmin[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + + ``` + +#### FMAXNM, FMINNM (single) + +Multi-vector floating point min/max number + +``` c + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x2_t svmaxnm[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + + + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x4_t svmaxnm[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + + + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x2_t svminnm[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + + + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x4_t svminnm[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + + ``` + +#### FMAXNM, FMINNM (multi) + +Multi-vector floating point min/max number + +``` c + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x2_t svmaxnm[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + + + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x4_t svmaxnm[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + + + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x2_t svminnm[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + + + // Variants are also available for _f32 and _f64 + __attribute__((arm_streaming)) + svfloat16x4_t svminnm[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + + ``` + +#### FRINTA, FRINTM, FRINTN, FRINTP + +Multi-vector floating-point round to integral value + +``` c + + __attribute__((arm_streaming)) + svfloat32x2_t svrinta[_f32][_x2](svfloat32x2_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x4_t svrinta[_f32][_x4](svfloat32x4_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x2_t svrintm[_f32][_x2](svfloat32x2_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x4_t svrintm[_f32][_x4](svfloat32x4_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x2_t svrintn[_f32][_x2](svfloat32x2_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x4_t svrintn[_f32][_x4](svfloat32x4_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x2_t svrintp[_f32][_x2](svfloat32x2_t zn); + + + + __attribute__((arm_streaming)) + svfloat32x4_t svrintp[_f32][_x4](svfloat32x4_t zn); + + ``` + +#### LD1B, LD1D, LD1H, LD1W + +Contiguous load to multi-vector + +``` c + + __attribute__((arm_streaming)) + svuint8x2_t svld1b_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint8x4_t svld1b_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint8x2_t svld1b[_u8]_x2(svcount_t png, const uint8_t *rn); + + + + __attribute__((arm_streaming)) + svuint8x4_t svld1b[_u8]_x4(svcount_t png, const uint8_t *rn); + + + + __attribute__((arm_streaming)) + svuint64x2_t svld1d_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint64x4_t svld1d_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + 
svuint64x2_t svld1d[_u64]_x2(svcount_t png, const uint64_t *rn); + + + + __attribute__((arm_streaming)) + svuint64x4_t svld1d[_u64]_x4(svcount_t png, const uint64_t *rn); + + + + __attribute__((arm_streaming)) + svuint16x2_t svld1h_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint16x4_t svld1h_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint16x2_t svld1h[_u16]_x2(svcount_t png, const uint16_t *rn); + + + + __attribute__((arm_streaming)) + svuint16x4_t svld1h[_u16]_x4(svcount_t png, const uint16_t *rn); + + + + __attribute__((arm_streaming)) + svuint32x2_t svld1w_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint32x4_t svld1w_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint32x2_t svld1w[_u32]_x2(svcount_t png, const uint32_t *rn); + + + + __attribute__((arm_streaming)) + svuint32x4_t svld1w[_u32]_x4(svcount_t png, const uint32_t *rn); + + ``` + +#### LDNT1B, LDNT1D, LDNT1H, LDNT1W + +Contiguous non-temporal load to multi-vector + +``` c + + __attribute__((arm_streaming)) + svuint8x2_t svldnt1b_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint8x4_t svldnt1b_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint8x2_t svldnt1b[_u8]_x2(svcount_t png, const uint8_t *rn); + + + + __attribute__((arm_streaming)) + svuint8x4_t svldnt1b[_u8]_x4(svcount_t png, const uint8_t *rn); + + + + __attribute__((arm_streaming)) + svuint64x2_t svldnt1d_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint64x4_t svldnt1d_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint64x2_t svldnt1d[_u64]_x2(svcount_t png, const uint64_t *rn); + + + + __attribute__((arm_streaming)) + svuint64x4_t svldnt1d[_u64]_x4(svcount_t png, const uint64_t *rn); + + + + __attribute__((arm_streaming)) + svuint16x2_t svldnt1h_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint16x4_t svldnt1h_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint16x2_t svldnt1h[_u16]_x2(svcount_t png, const uint16_t *rn); + + + + __attribute__((arm_streaming)) + svuint16x4_t svldnt1h[_u16]_x4(svcount_t png, const uint16_t *rn); + + + + __attribute__((arm_streaming)) + svuint32x2_t svldnt1w_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint32x4_t svldnt1w_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, + int64_t vnum); + + + + __attribute__((arm_streaming)) + svuint32x2_t svldnt1w[_u32]_x2(svcount_t png, const uint32_t *rn); + + + + __attribute__((arm_streaming)) + svuint32x4_t svldnt1w[_u32]_x4(svcount_t png, const uint32_t *rn); + + ``` + +#### ST1B, ST1D, ST1H, ST1W + +Contiguous store of multi-vector operand + +``` c + + __attribute__((arm_streaming)) + void svst1b[_u8][_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1b[_u8][_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1b_vnum[_u8][_x2](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x2_t zt); + + + + 
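+  // Illustrative usage sketch (not an intrinsic declaration). Assumes a
+  // streaming caller; the function name is hypothetical. A pair of vectors is
+  // loaded and stored under the predicate-as-counter png using the
+  // fully-resolved names of the _x2 forms listed above.
+  __attribute__((arm_streaming))
+  void example_copy_pair(svcount_t png, const uint8_t *src, uint8_t *dst) {
+    svuint8x2_t data = svld1b_u8_x2(png, src);
+    svst1b_u8_x2(png, dst, data);
+  }
+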
__attribute__((arm_streaming)) + void svst1b_vnum[_u8][_x4](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1d[_u64][_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1d[_u64][_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1d_vnum[_u64][_x2](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1d_vnum[_u64][_x4](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1h[_u16][_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1h[_u16][_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1h_vnum[_u16][_x2](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1h_vnum[_u16][_x4](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1w[_u32][_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1w[_u32][_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); + + + + __attribute__((arm_streaming)) + void svst1w_vnum[_u32][_x2](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x2_t zt); + + + + __attribute__((arm_streaming)) + void svst1w_vnum[_u32][_x4](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x4_t zt); + + ``` + +#### STNT1B, STNT1D, STNT1H, STNT1W + +Contiguous non-temporal store of multi-vector operand + +``` c + + __attribute__((arm_streaming)) + void svstnt1b[_u8][_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1b[_u8][_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1b_vnum[_u8][_x2](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1b_vnum[_u8][_x4](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1d[_u64][_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1d[_u64][_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1d_vnum[_u64][_x2](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1d_vnum[_u64][_x4](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1h[_u16][_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1h[_u16][_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1h_vnum[_u16][_x2](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1h_vnum[_u16][_x4](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1w[_u32][_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1w[_u32][_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); + + + + __attribute__((arm_streaming)) + void svstnt1w_vnum[_u32][_x2](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x2_t zt); + + + + 
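+  // Illustrative usage sketch (not an intrinsic declaration). Assumes a
+  // streaming caller with all elements of png active; the function name is
+  // hypothetical. As with the existing SVE _vnum forms, vnum is assumed to be
+  // scaled by the vector length, so vnum = 2 addresses the memory immediately
+  // after the two vectors written by the first store.
+  __attribute__((arm_streaming))
+  void example_store_four(svcount_t png, uint32_t *dst, svuint32x2_t lo,
+                          svuint32x2_t hi) {
+    svstnt1w_vnum_u32_x2(png, dst, 0, lo);
+    svstnt1w_vnum_u32_x2(png, dst, 2, hi);
+  }
+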
__attribute__((arm_streaming)) + void svstnt1w_vnum[_u32][_x4](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x4_t zt); + + ``` + +#### MOVT + +Move 8 bytes between general-purpose register and ZT0 + +``` c + + __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) + uint64_t svread_lane_zt(uint64_t zt, uint64_t imm_x8_offset); + + + + __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) + void svwrite_lane_zt(uint64_t zt, uint64_t imm_x8_offset, uint64_t rt); + + ``` + +#### LDR, STR + +Spill and fill of ZT0 + +``` c + + __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) + void svldr_zt(uint64_t zt, const uint8_t *rn); + + + + __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) + void svstr_zt(uint64_t zt, uint8_t *rn); + + ``` + +#### ZERO + +Zero ZT0 + +``` c + + __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) + void svzero_zt(uint64_t zt); + + ``` + +#### LUTI2, LUTI4 + +Lookup table read with 2-bit and 4-bit indexes + +``` c + // Variants are also available for _zt[_u16] and _zt[_u32] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svuint8_t svluti2_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx); + + + // Variants are also available for _zt[_u16] and _zt[_u32] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svuint8x2_t svluti2_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn, + uint64_t imm_idx); + + + // Variants are also available for _zt[_u16] and _zt[_u32] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svuint8x4_t svluti2_lane_zt[_u8]_x4(uint64_t zt, svuint8_t zn, + uint64_t imm_idx); + + + // Variants are also available for _zt[_u16] and _zt[_u32] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svuint8_t svluti4_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx); + + + // Variants are also available for _zt[_u16] and _zt[_u32] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svuint8x2_t svluti4_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn, + uint64_t imm_idx); + + + // Variants are also available for _zt[_u32] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svuint16x4_t svluti4_lane_zt[_u16]_x4(uint64_t zt, svuint16_t zn, + uint64_t imm_idx); + + ``` + +#### MOVA + +Move multi-vectors to/from ZA + +``` c + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint8x2_t svread_hor_za8[_s8]_vg2(uint32_t slice_base, + uint64_t slice_x2_offset); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint8x4_t svread_hor_za8[_s8]_vg4(uint32_t slice_base, + uint64_t slice_x4_offset); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint8x2_t svread_ver_za8[_s8]_vg2(uint32_t slice_base, + uint64_t slice_x2_offset); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint8x4_t svread_ver_za8[_s8]_vg4(uint32_t slice_base, + uint64_t 
slice_x4_offset); + + + // Variants are also available for _za64[_u64] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint64x2_t svread_za64[_s64]_vg1x2(uint32_t slice_base, + uint64_t slice_offset); + + + // Variants are also available for _za64[_u64] + __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint64x4_t svread_za64[_s64]_vg1x4(uint32_t slice_base, + uint64_t slice_offset); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za)) + void svwrite_hor_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset, + svint8x2_t zn); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za)) + void svwrite_hor_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za)) + void svwrite_ver_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset, + svint8x2_t zn); + + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], + // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + __attribute__((arm_streaming, arm_shared_za)) + void svwrite_ver_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn); + + + // Variants are also available for _za64[_u64] + __attribute__((arm_streaming, arm_shared_za)) + void svwrite_za64[_s64]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + svint64x2_t zn); + + + // Variants are also available for _za64[_u64] + __attribute__((arm_streaming, arm_shared_za)) + void svwrite_za64[_s64]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + svint64x4_t zn); + + ``` + +#### PTRUE + +Initialise predicate-as-counter to all active + +``` c + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svptrue_c8(); + + ``` + +#### PEXT + +Transform a predicate-as-counter to a predicate (pair). 
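For illustration only, the following sketch (an assumption about typical usage, not normative text) shows how an all-active predicate-as-counter produced by `svptrue_c8` might be expanded into ordinary `svbool_t` masks using the `svpext` intrinsics declared below. The function name `example_pext` and the interpretation of the immediate as a segment index are illustrative assumptions.

``` c
  // Hedged usage sketch: expand a predicate-as-counter into svbool_t masks.
  __attribute__((arm_streaming))
  void example_pext(void) {
    svcount_t pn = svptrue_c8();          // all-active predicate-as-counter
    svbool_t p0 = svpext_lane_c8(pn, 0);  // assumed: mask for the first segment
    svbool_t p1 = svpext_lane_c8(pn, 1);  // assumed: mask for the second segment
    (void)p0;
    (void)p1;
  }
```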
+ +``` c + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svbool_t svpext_lane_c8(svcount_t pnn, uint64_t imm); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svboolx2_t svpext_lane_c8_x2(svcount_t pnn, uint64_t imm); + + ``` + +#### PSEL + +Predicate select between predicate value or all-false + +``` c + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svbool_t svpsel_lane_b8(svbool_t pn, svbool_t pm, uint32_t idx, uint64_t imm); + + ``` + +#### CNTP + +Set scalar to count from predicate-as-counter + + +``` c + enum sv_vgkind { + SV_VLx2 = 0, + SV_VLx4 = 1 + } +``` + +``` c + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + uint64_t svcntp_c8(svcount_t pnn, sv_vgkind vl); + + ``` + +#### UCLAMP, SCLAMP, FCLAMP + +Multi-vector clamp to minimum/maximum vector + +``` c + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming_compatible)) + svfloat16_t svclamp[_f16](svfloat16_t zd, svfloat16_t zn, svfloat16_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x2_t svclamp[_single][_f16][_x2](svfloat16x2_t zd, svfloat16_t zn, + svfloat16_t zm); + + + // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, + // _u32 and _u64 + __attribute__((arm_streaming)) + svfloat16x4_t svclamp[_single][_f16][_x4](svfloat16x4_t zd, svfloat16_t zn, + svfloat16_t zm); + + ``` + +#### REVD + +Reverse 64-bit doublewords in elements + +``` c + // Variants are also available for _u16, _u32 and _u64 + __attribute__((arm_streaming_compatible)) + svuint8_t svrevd[_u8]_m(svuint8_t zd, svbool_t pg, svuint8_t zn); + + + // Variants are also available for _u16, _u32 and _u64 + __attribute__((arm_streaming_compatible)) + svuint8_t svrevd[_u8]_x(svbool_t pg, svuint8_t zn); + + + // Variants are also available for _u16, _u32 and _u64 + __attribute__((arm_streaming_compatible)) + svuint8_t svrevd[_u8]_z(svbool_t pg, svuint8_t zn); + + ``` + +#### SEL + +Multi-vector conditionally select elements from two vectors + +``` c + // Variants are also available for _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svuint8x2_t svsel[_u8][_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm); + + + // Variants are also available for _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svuint8x4_t svsel[_u8][_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm); + + ``` + +#### URSHL, SRSHL (single) + +Multi-vector rounding shift left + +``` c + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svrshl[_single][_s8][_x2](svint8x2_t zdn, svint8_t zm); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svrshl[_single][_s8][_x4](svint8x4_t zdn, svint8_t zm); + + ``` + +#### URSHL, SRSHL (multi) + +Multi-vector rounding shift left + +``` c + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svrshl[_s8][_x2](svint8x2_t zdn, svint8x2_t zm); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svrshl[_s8][_x4](svint8x4_t zdn, svint8x4_t zm); + + ``` + +#### 
SQRSHR, UQRSHR + +Multi-vector saturating rounding shift right narrow + +``` c + + __attribute__((arm_streaming)) + svint16_t svqrshr[_s16][_x2](svint32x2_t zn, uint64_t imm_1_8); + + + // Variants are also available for _s16 + __attribute__((arm_streaming)) + svint8_t svqrshr[_s8][_x4](svint32x4_t zn, uint64_t imm_1_8); + + + + __attribute__((arm_streaming)) + svuint16_t svuqrshr[_u16][_x2](svuint32x2_t zn, uint64_t imm_1_8); + + + // Variants are also available for _u16 + __attribute__((arm_streaming)) + svuint8_t svuqrshr[_u8][_x4](svuint32x4_t zn, uint64_t imm_1_8); + + ``` + +#### SQRSHRN, UQRSHRN + +Multi-vector saturating rounding shift right narrow and interleave + +``` c + // Variants are also available for _u16 + __attribute__((arm_streaming_compatible)) + svint16_t svqrshrn[_s16][_x2](svint32x2_t zn, uint64_t imm_1_8); + + + // Variants are also available for _s16, _u8 and _u16 + __attribute__((arm_streaming)) + svint8_t svqrshrn[_s8][_x4](svint32x4_t zn, uint64_t imm_1_8); + + ``` + +#### SQRSHRU + +Multi-vector saturating rounding shift right unsigned narrow + +``` c + + __attribute__((arm_streaming)) + svuint16_t svsqrshru[_u16][_x2](svint32x2_t zn, uint64_t imm_1_8); + + + // Variants are also available for _u16 + __attribute__((arm_streaming)) + svuint8_t svsqrshru[_u8][_x4](svint32x4_t zn, uint64_t imm_1_8); + + ``` + +#### SQRSHRUN + +Multi-vector saturating rounding shift right unsigned narrow and interleave + +``` c + + __attribute__((arm_streaming_compatible)) + svuint16_t svsqrshrun[_u16][_x2](svint32x2_t zn, uint64_t imm_1_8); + + + // Variants are also available for _u16 + __attribute__((arm_streaming)) + svuint8_t svsqrshrun[_u8][_x4](svint32x4_t zn, uint64_t imm_1_8); + + ``` + +#### SQDMULH (single) + +Multi-vector signed saturating doubling multiply high + +``` c + // Variants are also available for _s16, _s32 and _s64 + __attribute__((arm_streaming)) + svint8x2_t svsqdmulh[_single][_s8][_x2](svint8x2_t zdn, svint8_t zm); + + + // Variants are also available for _s16, _s32 and _s64 + __attribute__((arm_streaming)) + svint8x4_t svsqdmulh[_single][_s8][_x4](svint8x4_t zdn, svint8_t zm); + + ``` + +#### SQDMULH (multi) + +Multi-vector signed saturating doubling multiply high + +``` c + // Variants are also available for _s16, _s32 and _s64 + __attribute__((arm_streaming)) + svint8x2_t svsqdmulh[_s8][_x2](svint8x2_t zdn, svint8x2_t zm); + + + // Variants are also available for _s16, _s32 and _s64 + __attribute__((arm_streaming)) + svint8x4_t svsqdmulh[_s8][_x4](svint8x4_t zdn, svint8x4_t zm); + + ``` + +#### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT + +While (resulting in predicate-as-counter) + +``` c + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilege_c8(int64_t rn, int64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilegt_c8(int64_t rn, int64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilehi_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilehs_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilele_c8(int64_t rn, int64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 
+ __attribute__((arm_streaming)) + svcount_t svwhilelo_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilels_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + + + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming)) + svcount_t svwhilelt_c8(int64_t rn, int64_t rm, sv_vgkind vl); + + ``` + +#### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT + +While (resulting in predicate tuple) + +``` c + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilege_b8_x2(int64_t rn, int64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilegt_b8_x2(int64_t rn, int64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilehi_b8_x2(uint64_t rn, uint64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilehs_b8_x2(uint64_t rn, uint64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilele_b8_x2(int64_t rn, int64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilelo_b8_x2(uint64_t rn, uint64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilels_b8_x2(uint64_t rn, uint64_t rm); + + + // Variants are also available for _b16, _b32 and _b64 + __attribute__((arm_streaming_compatible)) + svboolx2_t svwhilelt_b8_x2(int64_t rn, int64_t rm); + + ``` + +#### SUNPK, UUNPK + +Multi-vector pack/unpack + +``` c + // Variants are also available for _s32, _s64, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint16x2_t svunpk[_s16][_x2](svint8_t zn); + + + // Variants are also available for _s32, _s64, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint16x4_t svunpk[_s16][_x4](svint8x2_t zn); + + ``` + +#### ZIP, UZP + +Multi-vector zip/unzip (2 vectors) + +The uzipq instructions operate on quad-words, but for convenience accept all element types. + +``` c + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm); + + ``` + +#### ZIP, UZP + +Multi-vector zip/unzip (4 vectors) + +The zipq instructions operate on quad-words, but for convenience accept all element types. 
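As a hedged illustration of the two-vector forms above (an assumption about usage, not normative text; the four-vector forms are declared below), a pair of vectors might be interleaved as follows, spelling out the optional `_s8` suffix from the declaration:

``` c
  // Hedged usage sketch: interleave two vectors into a vector pair.
  __attribute__((arm_streaming))
  svint8x2_t example_zip(svint8_t lo, svint8_t hi) {
    return svzip_s8_x2(lo, hi);  // two-vector zip declared above
  }
```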
+ +``` c + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svuzp[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svuzpq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svzip[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); + + + // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + __attribute__((arm_streaming)) + svint8x4_t svzipq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); + + ``` + + + # M-profile Vector Extension (MVE) intrinsics The M-profile Vector Extension (MVE) [[MVE-spec]](#MVE-spec) instructions provide packed Single From d48ed1e19fdaca1c0518804562bf19d259095173 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 10 Nov 2022 16:47:59 +0000 Subject: [PATCH 02/25] Address review comments [to squash] * Addressed suggestions in descriptions * Removed unnecessary newlines * Merged optional suffixes e.g. `[_single][_u16][_x2]` -> `[_single_u16_x2]` * Renamed svuqrshr -> svqrshr (the signedness can be deduced form the operands and there wasn't a svsqrshr either for SQRSHR) * Replaced `enum sv_vgkind` by `uint64_t` immediate. * Generalised `imm_1_8` into `imm` since this immediate wasn't correct for all types. --- main/acle.md | 755 ++++++++++++++++++++++++++------------------------- 1 file changed, 380 insertions(+), 375 deletions(-) diff --git a/main/acle.md b/main/acle.md index 2539816b..80d3b770 100644 --- a/main/acle.md +++ b/main/acle.md @@ -1,7 +1,7 @@ --- title: Arm C Language Extensions -version: 2022Q2 + SME2 (Alpha) -date-of-issue: 06 Oct 2022 +version: 2022Q2 +date-of-issue: 01 Jul 2022 # LaTeX specific variables copyright-text: "Copyright: see section \\texorpdfstring{\\nameref{copyright}}{Copyright}." draftversion: true @@ -8614,14 +8614,15 @@ function F. There are then two cases: ## ZT0 Lookup Table When ZA storage is enabled, SME2 additionally provides access to a 64-byte large -lookup table called ZT0 which can be accesses through specialized instructions. +lookup table called ZT0 which can be accessed through specialized instructions. ZT0 is architecturally linked to ZA such that changing PSTATE.ZA enables or disables both ZA and ZT0 simultaneously. -This means that when a function has ZA state, it similarly has ZT state. +This means that when the hardware supports SME2, a function that has +[ZA state](#za-state) also has ZT state. ## SME attributes @@ -8774,9 +8775,8 @@ This attribute applies to **function types** and specifies the following: can use ZA to receive data from its callers and to pass data back to its callers. -* The function has [ZT state](#zt-state). - -* The function's ZT state is created on entry to the function and destroyed +* When the hardware supports SME2, the function has [ZT state](#zt-state). + The function's ZT state is created on entry to the function and destroyed on return from the function. That is, the function does not use ZT0 to receive data from callers or to pass data back to callers. @@ -8795,9 +8795,8 @@ following: on return from the function. That is, the function does not use ZA to receive data from callers or to pass data back to callers. -* The function has [ZT state](#zt-state). 
- -* The function's ZT state is created on entry to the function and destroyed +* When the hardware supports SME2, the function has [ZT state](#zt-state). + The function's ZT state is created on entry to the function and destroyed on return from the function. That is, the function does not use ZT0 to receive data from callers or to pass data back to callers. @@ -8859,7 +8858,7 @@ depends on whether the function is [shared-ZA](#shared-za) or The platform may place additional requirements as well. -* ZT state is not considered preserved when a function is marked with +* ZT state is also considered preserved when a function is marked with [`arm_preserves_za`](#arm_preserves_za). In both cases, the onus is on the definition of the function to honor @@ -9468,26 +9467,26 @@ ZA array vectors. The intrinsics model this in the following way: `svint32x2_t` for a multi-vector operand of two 32-bit element vectors, or `svint64x4_t` for a multi-vector operand of four 64-bit element vectors. +* The architecture distinguishes between multi-vector operands with + consecutive registers and multi-vector operands with strided registers. + This level of detail is not exposed to the C/C++ intrinsics or types. It is + left up to the compiler to choose the most optimal form. + * Intrinsic functions have a `_x2` or `_x4` suffix if the function\'s return value is a vector group of 2 or 4 data vectors and the function operates purely on vectors, not on the matrix array or tile slices. * Intrinsic functions have a `_vg2` or `_vg4` suffix if the function - operates on groups of 2 or 4 ZA tiles slices. For example: + operates on groups of 2 or 4 ZA tile slices. For example: ``` c // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. __attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_vg2_hor_za8[_s8](uint32_t slice_base, + svint8x2_t svread_hor_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset); ``` -* The architecture distinguishes between tuples with consecutive registers - and tuples with strided registers. This level of detail is not exposed to - the C/C++ intrinsics or types. It is left up to the compiler to choose the - most optimal form. - * Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function operates on 2 or 4 single-vector groups within a ZA array. @@ -9501,7 +9500,7 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // SMLAL intrinsic for 2 quad-vector groups. __attributes__((arm_streaming, arm_shared_za)) - void svmlal_vg4x2_lane_za32[_s8](uint32_t slice_base, uint64_t slice_x4_offset, + void svmlal_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svint8x2_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -9527,17 +9526,17 @@ ZA array vectors. 
The intrinsics model this in the following way: ``` c __attributes__((arm_streaming, arm_shared_za)) - void svmlal_vg4x2_za32[_s8](uint32_t slice_base, uint64_t slice_x4_offset, + void svmlal_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svint8x2_t zn, svint8x2_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmlal_vg4x2[_single]_za32[_s8](uint32_t slice_base, + void svmlal[_single]_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svint8x2_t zn, svint8_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmlal_vg4x2_lane_za32[_s8](uint32_t slice_base, uint64_t slice_x4_offset, + void svmlal_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svint8x2_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -9642,14 +9641,18 @@ The additional '_write' suffix indicates that the operation is not accumulating, Multi-vector add/sub ``` c - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _single_u8_x2, _single_s16_x2, + // _single_u16_x2, _single_s32_x2, _single_u32_x2, _single_s64_x2 and + // _single_u64_x2 __attribute__((arm_streaming)) - svint8x2_t svadd[_single][_s8][_x2](svint8x2_t zdn, svint8_t zm); + svint8x2_t svadd[_single_s8_x2](svint8x2_t zdn, svint8_t zm); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _single_u8_x4, _single_s16_x4, + // _single_u16_x4, _single_s32_x4, _single_u32_x4, _single_s64_x4 and + // _single_u64_x4 __attribute__((arm_streaming)) - svint8x4_t svadd[_single][_s8][_x4](svint8x4_t zdn, svint8_t zm); + svint8x4_t svadd[_single_s8_x4](svint8x4_t zdn, svint8_t zm); ``` @@ -9708,9 +9711,9 @@ Multi-vector add/sub and accumulate into ZA Multi-vector floating-point convert from single-precision to interleaved half-precision/BFloat16 ``` c - // Variants are also available for _f16[_f32] + // Variants are also available for _f16[_f32_x2] __attribute__((arm_streaming)) - svbfloat16_t svcvtn_bf16[_f32][_x2](svfloat32x2_t zn); + svbfloat16_t svcvtn_bf16[_f32_x2](svfloat32x2_t zn); ``` @@ -9719,15 +9722,19 @@ Multi-vector floating-point convert from single-precision to interleaved half-pr Multi-vector convert to/from floating-point. ``` c - // Variants are also available for _f16[_f32], _f32[_s32], _f32[_u32], - // _s32[_f32] and _u32[_f32] + // Variants are also available for _f16[_f32_x2] __attribute__((arm_streaming)) - svbfloat16_t svcvt_bf16[_f32][_x2](svfloat32x2_t zn); + svbfloat16_t svcvt_bf16[_f32_x2](svfloat32x2_t zn); + + + // Variants are also available for _f32[_u32_x2], _s32[_f32_x2] and _u32[_f32_x2] + __attribute__((arm_streaming)) + svfloat32x2_t svcvt_f32[_s32_x2](svint32x2_t zn); - // Variants are also available for _f32[_u32], _s32[_f32] and _u32[_f32] + // Variants are also available for _f32[_u32_x4], _s32[_f32_x4] and _u32[_f32_x4] __attribute__((arm_streaming)) - svfloat32x4_t svcvt_f32[_s32][_x4](svint32x4_t zn); + svfloat32x4_t svcvt_f32[_s32_x4](svint32x4_t zn); ``` @@ -9736,15 +9743,15 @@ Multi-vector convert to/from floating-point. 
Multi-vector saturating extract narrow ``` c - // Variants are also available for _u16[_s32] and _u16[_u32] + // Variants are also available for _u16[_s32_x2] and _u16[_u32_x2] __attribute__((arm_streaming)) - svint16_t svqcvt_s16[_s32][_x2](svint32x2_t zn); + svint16_t svqcvt_s16[_s32_x2](svint32x2_t zn); - // Variants are also available for _s16[_s64], _u8[_s32], _u8[_u32], _u16[_s64] - // and _u16[_u64] + // Variants are also available for _u8[_s32_x4], _u8[_u32_x4], _s16[_s64_x4], + // _u16[_s64_x4] and _u16[_u64_x4] __attribute__((arm_streaming)) - svint8_t svqcvt_s8[_s32][_x4](svint32x4_t zn); + svint8_t svqcvt_s8[_s32_x4](svint32x4_t zn); ``` @@ -9753,15 +9760,15 @@ Multi-vector saturating extract narrow Multi-vector saturating extract narrow and interleave ``` c - // Variants are also available for _u16[_s32] and _u16[_u32] + // Variants are also available for _u16[_s32_x2] and _u16[_u32_x2] __attribute__((arm_streaming_compatible)) - svint16_t svqcvtn_s16[_s32][_x2](svint32x2_t zn); + svint16_t svqcvtn_s16[_s32_x2](svint32x2_t zn); - // Variants are also available for _s16[_s64], _u8[_s32], _u8[_u32], _u16[_s64] - // and _u16[_u64] + // Variants are also available for _u8[_s32_x4], _u8[_u32_x4], _s16[_s64_x4], + // _u16[_s64_x4] and _u16[_u64_x4] __attribute__((arm_streaming)) - svint8_t svqcvtn_s8[_s32][_x4](svint32x4_t zn); + svint8_t svqcvtn_s8[_s32_x4](svint32x4_t zn); ``` @@ -9823,28 +9830,24 @@ Multi-vector dot-product (2-way and 4-way) svbfloat16x4_t zn, svbfloat16_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svsudot[_single]_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svint8x2_t zn, svuint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svsudot[_single]_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svint8x4_t zn, svuint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusdot[_single]_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svuint8x2_t zn, svint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusdot[_single]_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svuint8x4_t zn, @@ -9885,13 +9888,11 @@ Multi-vector dot-product (2-way and 4-way) svbfloat16x4_t zn, svbfloat16x4_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusdot_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svuint8x2_t zn, svint8x2_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusdot_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svuint8x4_t zn, svint8x4_t zm); @@ -9933,28 +9934,24 @@ Multi-vector dot-product (2-way and 4-way) uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svsudot_lane_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svint8x2_t zn, svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svsudot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svusdot_lane_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svuint8x2_t zn, svint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svusdot_lane_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svuint8x4_t zn, svint8_t zm, @@ -9974,7 +9971,6 @@ Multi-vector vertical dot-product by indexed element. 
uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svusvdot_lane_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svuint8x4_t zn, svint8_t zm, @@ -10028,7 +10024,6 @@ Bitwise exclusive NOR population count outer product and accumulate/subtract svuint32_t zn, svuint32_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svbmops_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm); @@ -10339,35 +10334,30 @@ Multi-vector multiply-add long long (widening) svint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svsumlall[_single]_za32[_u8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svint8x2_t zn, svuint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svsumlall[_single]_za32[_u8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, svint8x4_t zn, svuint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall[_single]_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, svuint8_t zn, svint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall[_single]_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svuint8x2_t zn, svint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall[_single]_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, @@ -10400,13 +10390,11 @@ Multi-vector multiply-add long long (widening) svint8x4_t zn, svint8x4_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svuint8x2_t zn, svint8x2_t zm); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, svuint8x4_t zn, svint8x4_t zm); @@ -10448,42 +10436,36 @@ Multi-vector multiply-add long long (widening) svint8x4_t zn, svint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svsumlall_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, svint8_t zn, svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svsumlall_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svint8x2_t zn, svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svsumlall_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall_lane_za32[_u8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, svuint8_t zn, svint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall_lane_za32[_u8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, svuint8x2_t zn, svint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) void svusmlall_lane_za32[_u8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, svuint8x4_t zn, @@ -10602,18 +10584,15 @@ BFloat16 floating-point multiply-subtract long from single-precision (top/bottom svfloat32_t svmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); - __attribute__((arm_streaming_compatible)) svfloat32_t svmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm, uint64_t imm_idx); - __attribute__((arm_streaming_compatible)) svfloat32_t svmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); - __attribute__((arm_streaming_compatible)) svfloat32_t svmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm, uint64_t imm_idx); @@ -10625,28 +10604,32 @@ BFloat16 
floating-point multiply-subtract long from single-precision (top/bottom Multi-vector min/max ``` c - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _single_s8_x2, _single_u8_x2, + // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, + // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svmax[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + svfloat16x2_t svmax[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _single_s8_x4, _single_u8_x4, + // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, + // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svmax[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + svfloat16x4_t svmax[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _single_s8_x2, _single_u8_x2, + // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, + // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svmin[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + svfloat16x2_t svmin[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _single_s8_x4, _single_u8_x4, + // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, + // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svmin[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + svfloat16x4_t svmin[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); ``` @@ -10655,28 +10638,28 @@ Multi-vector min/max Multi-vector min/max ``` c - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _s8_x2, _u8_x2, _s16_x2, _u16_x2, + // _s32_x2, _u32_x2, _f32_x2, _s64_x2, _u64_x2 and _f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svmax[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + svfloat16x2_t svmax[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _s8_x4, _u8_x4, _s16_x4, _u16_x4, + // _s32_x4, _u32_x4, _f32_x4, _s64_x4, _u64_x4 and _f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svmax[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + svfloat16x4_t svmax[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _s8_x2, _u8_x2, _s16_x2, _u16_x2, + // _s32_x2, _u32_x2, _f32_x2, _s64_x2, _u64_x2 and _f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svmin[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + svfloat16x2_t svmin[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _s8_x4, _u8_x4, _s16_x4, _u16_x4, + // _s32_x4, 
_u32_x4, _f32_x4, _s64_x4,_u64_x4 and _f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svmin[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + svfloat16x4_t svmin[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); ``` @@ -10685,24 +10668,24 @@ Multi-vector min/max Multi-vector floating point min/max number ``` c - // Variants are also available for _f32 and _f64 + // Variants are also available for _single_f32_x2 and _single_f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svmaxnm[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + svfloat16x2_t svmaxnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - // Variants are also available for _f32 and _f64 + // Variants are also available for _single_f32_x4 and _single_f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svmaxnm[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + svfloat16x4_t svmaxnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); - // Variants are also available for _f32 and _f64 + // Variants are also available for _single_f32_x2 and _single_f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svminnm[_single][_f16][_x2](svfloat16x2_t zdn, svfloat16_t zm); + svfloat16x2_t svminnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - // Variants are also available for _f32 and _f64 + // Variants are also available for _single_f32_x4 and _single_f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svminnm[_single][_f16][_x4](svfloat16x4_t zdn, svfloat16_t zm); + svfloat16x4_t svminnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); ``` @@ -10711,24 +10694,24 @@ Multi-vector floating point min/max number Multi-vector floating point min/max number ``` c - // Variants are also available for _f32 and _f64 + // Variants are also available for _f32_x2 and _f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svmaxnm[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + svfloat16x2_t svmaxnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - // Variants are also available for _f32 and _f64 + // Variants are also available for _f32_x4 and _f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svmaxnm[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + svfloat16x4_t svmaxnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); - // Variants are also available for _f32 and _f64 + // Variants are also available for _f32_x2 and _f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svminnm[_f16][_x2](svfloat16x2_t zdn, svfloat16x2_t zm); + svfloat16x2_t svminnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - // Variants are also available for _f32 and _f64 + // Variants are also available for _f32_x4 and _f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svminnm[_f16][_x4](svfloat16x4_t zdn, svfloat16x4_t zm); + svfloat16x4_t svminnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); ``` @@ -10739,42 +10722,35 @@ Multi-vector floating-point round to integral value ``` c __attribute__((arm_streaming)) - svfloat32x2_t svrinta[_f32][_x2](svfloat32x2_t zn); - + svfloat32x2_t svrinta[_f32_x2](svfloat32x2_t zn); __attribute__((arm_streaming)) - svfloat32x4_t svrinta[_f32][_x4](svfloat32x4_t zn); - + svfloat32x4_t svrinta[_f32_x4](svfloat32x4_t zn); __attribute__((arm_streaming)) - svfloat32x2_t svrintm[_f32][_x2](svfloat32x2_t zn); - + svfloat32x2_t svrintm[_f32_x2](svfloat32x2_t zn); __attribute__((arm_streaming)) - svfloat32x4_t svrintm[_f32][_x4](svfloat32x4_t zn); - + svfloat32x4_t svrintm[_f32_x4](svfloat32x4_t zn); __attribute__((arm_streaming)) - svfloat32x2_t svrintn[_f32][_x2](svfloat32x2_t zn); - + svfloat32x2_t 
svrintn[_f32_x2](svfloat32x2_t zn); __attribute__((arm_streaming)) - svfloat32x4_t svrintn[_f32][_x4](svfloat32x4_t zn); - + svfloat32x4_t svrintn[_f32_x4](svfloat32x4_t zn); __attribute__((arm_streaming)) - svfloat32x2_t svrintp[_f32][_x2](svfloat32x2_t zn); - + svfloat32x2_t svrintp[_f32_x2](svfloat32x2_t zn); __attribute__((arm_streaming)) - svfloat32x4_t svrintp[_f32][_x4](svfloat32x4_t zn); + svfloat32x4_t svrintp[_f32_x4](svfloat32x4_t zn); ``` @@ -10784,91 +10760,92 @@ Contiguous load to multi-vector ``` c + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x2_t svld1b_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, - int64_t vnum); - + svuint8x2_t svld1[_u8]_x2(svcount_t png, const uint8_t *rn); + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x4_t svld1b_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, - int64_t vnum); - + svuint8x4_t svld1[_u8]_x4(svcount_t png, const uint8_t *rn); + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x2_t svld1b[_u8]_x2(svcount_t png, const uint8_t *rn); - + svuint8x2_t svld1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, + int64_t vnum); + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x4_t svld1b[_u8]_x4(svcount_t png, const uint8_t *rn); - + svuint8x4_t svld1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, + int64_t vnum); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x2_t svld1d_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, - int64_t vnum); - + svuint16x2_t svld1[_u16]_x2(svcount_t png, const uint16_t *rn); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x4_t svld1d_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, - int64_t vnum); - + svuint16x4_t svld1[_u16]_x4(svcount_t png, const uint16_t *rn); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x2_t svld1d[_u64]_x2(svcount_t png, const uint64_t *rn); - + svuint16x2_t svld1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, + int64_t vnum); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x4_t svld1d[_u64]_x4(svcount_t png, const uint64_t *rn); - + svuint16x4_t svld1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, + int64_t vnum); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x2_t svld1h_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, - int64_t vnum); - + svuint32x2_t svld1[_u32]_x2(svcount_t png, const uint32_t *rn); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x4_t svld1h_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, - int64_t vnum); - + svuint32x4_t svld1[_u32]_x4(svcount_t png, const uint32_t *rn); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x2_t svld1h[_u16]_x2(svcount_t png, const uint16_t *rn); - + svuint32x2_t svld1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, + int64_t vnum); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x4_t svld1h[_u16]_x4(svcount_t png, const uint16_t *rn); - + svuint32x4_t svld1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, + int64_t vnum); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x2_t svld1w_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, - int64_t vnum); - + svuint64x2_t svld1[_u64]_x2(svcount_t 
png, const uint64_t *rn); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x4_t svld1w_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, - int64_t vnum); - + svuint64x4_t svld1[_u64]_x4(svcount_t png, const uint64_t *rn); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x2_t svld1w[_u32]_x2(svcount_t png, const uint32_t *rn); - + svuint64x2_t svld1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, + int64_t vnum); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x4_t svld1w[_u32]_x4(svcount_t png, const uint32_t *rn); + svuint64x4_t svld1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, + int64_t vnum); ``` @@ -10878,91 +10855,92 @@ Contiguous non-temporal load to multi-vector ``` c + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x2_t svldnt1b_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, - int64_t vnum); - + svuint8x2_t svldnt1[_u8]_x2(svcount_t png, const uint8_t *rn); + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x4_t svldnt1b_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, - int64_t vnum); - + svuint8x4_t svldnt1[_u8]_x4(svcount_t png, const uint8_t *rn); + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x2_t svldnt1b[_u8]_x2(svcount_t png, const uint8_t *rn); - + svuint8x2_t svldnt1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, + int64_t vnum); + // Variants are also available for _s8 __attribute__((arm_streaming)) - svuint8x4_t svldnt1b[_u8]_x4(svcount_t png, const uint8_t *rn); - + svuint8x4_t svldnt1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, + int64_t vnum); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x2_t svldnt1d_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, - int64_t vnum); - + svuint16x2_t svldnt1[_u16]_x2(svcount_t png, const uint16_t *rn); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x4_t svldnt1d_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, - int64_t vnum); - + svuint16x4_t svldnt1[_u16]_x4(svcount_t png, const uint16_t *rn); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x2_t svldnt1d[_u64]_x2(svcount_t png, const uint64_t *rn); - + svuint16x2_t svldnt1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, + int64_t vnum); + // Variants are also available for _s16, _f16 and _bf16 __attribute__((arm_streaming)) - svuint64x4_t svldnt1d[_u64]_x4(svcount_t png, const uint64_t *rn); - + svuint16x4_t svldnt1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, + int64_t vnum); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x2_t svldnt1h_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, - int64_t vnum); - + svuint32x2_t svldnt1[_u32]_x2(svcount_t png, const uint32_t *rn); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x4_t svldnt1h_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, - int64_t vnum); - + svuint32x4_t svldnt1[_u32]_x4(svcount_t png, const uint32_t *rn); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x2_t svldnt1h[_u16]_x2(svcount_t png, const uint16_t *rn); - + svuint32x2_t svldnt1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, + int64_t vnum); + // Variants are also available for _s32 and _f32 __attribute__((arm_streaming)) - svuint16x4_t 
svldnt1h[_u16]_x4(svcount_t png, const uint16_t *rn); - + svuint32x4_t svldnt1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, + int64_t vnum); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x2_t svldnt1w_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, - int64_t vnum); - + svuint64x2_t svldnt1[_u64]_x2(svcount_t png, const uint64_t *rn); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x4_t svldnt1w_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, - int64_t vnum); - + svuint64x4_t svldnt1[_u64]_x4(svcount_t png, const uint64_t *rn); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x2_t svldnt1w[_u32]_x2(svcount_t png, const uint32_t *rn); - + svuint64x2_t svldnt1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, + int64_t vnum); + // Variants are also available for _s64 and _f64 __attribute__((arm_streaming)) - svuint32x4_t svldnt1w[_u32]_x4(svcount_t png, const uint32_t *rn); + svuint64x4_t svldnt1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, + int64_t vnum); ``` @@ -10972,91 +10950,92 @@ Contiguous store of multi-vector operand ``` c + // Variants are also available for _s8_x2 __attribute__((arm_streaming)) - void svst1b[_u8][_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); - + void svst1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); + // Variants are also available for _s8_x4 __attribute__((arm_streaming)) - void svst1b[_u8][_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); - + void svst1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); + // Variants are also available for _s8_x2 __attribute__((arm_streaming)) - void svst1b_vnum[_u8][_x2](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x2_t zt); - + void svst1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x2_t zt); + // Variants are also available for _s8_x4 __attribute__((arm_streaming)) - void svst1b_vnum[_u8][_x4](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x4_t zt); - + void svst1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x4_t zt); + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 __attribute__((arm_streaming)) - void svst1d[_u64][_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); - + void svst1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 __attribute__((arm_streaming)) - void svst1d[_u64][_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); - + void svst1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 __attribute__((arm_streaming)) - void svst1d_vnum[_u64][_x2](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x2_t zt); - - + void svst1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x2_t zt); + + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 __attribute__((arm_streaming)) - void svst1d_vnum[_u64][_x4](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x4_t zt); - + void svst1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x4_t zt); + // Variants are also available for _s32_x2 and _f32_x2 __attribute__((arm_streaming)) - void svst1h[_u16][_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); - + void svst1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); + // Variants are also available for _s32_x4 and _f32_x4 __attribute__((arm_streaming)) - void svst1h[_u16][_x4](svcount_t png, uint16_t *rn, 
svuint16x4_t zt); - + void svst1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); + // Variants are also available for _s32_x2 and _f32_x2 __attribute__((arm_streaming)) - void svst1h_vnum[_u16][_x2](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x2_t zt); - + void svst1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x2_t zt); + // Variants are also available for _s32_x4 and _f32_x4 __attribute__((arm_streaming)) - void svst1h_vnum[_u16][_x4](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x4_t zt); - + void svst1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x4_t zt); + // Variants are also available for _s64_x2 and _f64_x2 __attribute__((arm_streaming)) - void svst1w[_u32][_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); - + void svst1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); + // Variants are also available for _s64_x4 and _f64_x4 __attribute__((arm_streaming)) - void svst1w[_u32][_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); - + void svst1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); + // Variants are also available for _s64_x2 and _f64_x2 __attribute__((arm_streaming)) - void svst1w_vnum[_u32][_x2](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x2_t zt); - + void svst1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x2_t zt); + // Variants are also available for _s64_x4 and _f64_x4 __attribute__((arm_streaming)) - void svst1w_vnum[_u32][_x4](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x4_t zt); + void svst1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x4_t zt); ``` @@ -11066,91 +11045,92 @@ Contiguous non-temporal store of multi-vector operand ``` c + // Variants are also available for _s8_x2 __attribute__((arm_streaming)) - void svstnt1b[_u8][_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); - + void svstnt1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); + // Variants are also available for _s8_x4 __attribute__((arm_streaming)) - void svstnt1b[_u8][_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); - + void svstnt1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); + // Variants are also available for _s8_x2 __attribute__((arm_streaming)) - void svstnt1b_vnum[_u8][_x2](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x2_t zt); - + void svstnt1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x2_t zt); + // Variants are also available for _s8_x4 __attribute__((arm_streaming)) - void svstnt1b_vnum[_u8][_x4](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x4_t zt); - + void svstnt1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, + svuint8x4_t zt); + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 __attribute__((arm_streaming)) - void svstnt1d[_u64][_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); - + void svstnt1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 __attribute__((arm_streaming)) - void svstnt1d[_u64][_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); - + void svstnt1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 __attribute__((arm_streaming)) - void svstnt1d_vnum[_u64][_x2](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x2_t zt); - + void svstnt1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x2_t zt); + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 __attribute__((arm_streaming)) 
- void svstnt1d_vnum[_u64][_x4](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x4_t zt); - + void svstnt1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, + svuint16x4_t zt); + // Variants are also available for _s32_x2 and _f32_x2 __attribute__((arm_streaming)) - void svstnt1h[_u16][_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); - + void svstnt1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); + // Variants are also available for _s32_x4 and _f32_x4 __attribute__((arm_streaming)) - void svstnt1h[_u16][_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); - + void svstnt1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); + // Variants are also available for _s32_x2 and _f32_x2 __attribute__((arm_streaming)) - void svstnt1h_vnum[_u16][_x2](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x2_t zt); - + void svstnt1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x2_t zt); + // Variants are also available for _s32_x4 and _f32_x4 __attribute__((arm_streaming)) - void svstnt1h_vnum[_u16][_x4](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x4_t zt); - + void svstnt1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, + svuint32x4_t zt); + // Variants are also available for _s64_x2 and _f64_x2 __attribute__((arm_streaming)) - void svstnt1w[_u32][_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); - + void svstnt1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); + // Variants are also available for _s64_x4 and _f64_x4 __attribute__((arm_streaming)) - void svstnt1w[_u32][_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); - + void svstnt1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); + // Variants are also available for _s64_x2 and _f64_x2 __attribute__((arm_streaming)) - void svstnt1w_vnum[_u32][_x2](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x2_t zt); - + void svstnt1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x2_t zt); + // Variants are also available for _s64_x4 and _f64_x4 __attribute__((arm_streaming)) - void svstnt1w_vnum[_u32][_x4](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x4_t zt); + void svstnt1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x4_t zt); ``` @@ -11164,7 +11144,6 @@ Move 8 bytes between general-purpose register and ZT0 uint64_t svread_lane_zt(uint64_t zt, uint64_t imm_x8_offset); - __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) void svwrite_lane_zt(uint64_t zt, uint64_t imm_x8_offset, uint64_t rt); @@ -11177,12 +11156,11 @@ Spill and fill of ZT0 ``` c __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svldr_zt(uint64_t zt, const uint8_t *rn); - + void svldr_zt(uint64_t zt, const void *rn); __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svstr_zt(uint64_t zt, uint8_t *rn); + void svstr_zt(uint64_t zt, void *rn); ``` @@ -11243,80 +11221,88 @@ Move multi-vectors to/from ZA ``` c // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t svread_hor_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], 
_za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x4_t svread_hor_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t svread_ver_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x4_t svread_ver_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset); - // Variants are also available for _za64[_u64] + // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint64x2_t svread_za64[_s64]_vg1x2(uint32_t slice_base, uint64_t slice_offset); - // Variants are also available for _za64[_u64] + // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint64x4_t svread_za64[_s64]_vg1x4(uint32_t slice_base, uint64_t slice_offset); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) void svwrite_hor_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) void svwrite_hor_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset, svint8x4_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) void svwrite_ver_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], - // _za32[_s32], _za32[_u32], _za64[_s64] and _za64[_u64] + // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], + // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) void svwrite_ver_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset, svint8x4_t zn); - // Variants are also available for _za64[_u64] + // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za)) void svwrite_za64[_s64]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svint64x2_t zn); - // Variants are also available for _za64[_u64] + // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za)) void svwrite_za64[_s64]_vg1x4(uint32_t slice_base, uint64_t 
slice_offset, svint64x4_t zn); @@ -11365,18 +11351,10 @@ Predicate select between predicate value or all-false Set scalar to count from predicate-as-counter - -``` c - enum sv_vgkind { - SV_VLx2 = 0, - SV_VLx4 = 1 - } -``` - ``` c // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - uint64_t svcntp_c8(svcount_t pnn, sv_vgkind vl); + uint64_t svcntp_c8(svcount_t pnn, uint64_t vl); ``` @@ -11385,24 +11363,26 @@ Set scalar to count from predicate-as-counter Multi-vector clamp to minimum/maximum vector ``` c - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _s8, _u8, _s16, _u16, _s32, _u32, _f32, + // _s64, _u64 and _f64 __attribute__((arm_streaming_compatible)) svfloat16_t svclamp[_f16](svfloat16_t zd, svfloat16_t zn, svfloat16_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _single_s8_x2, _single_u8_x2, + // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, + // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 __attribute__((arm_streaming)) - svfloat16x2_t svclamp[_single][_f16][_x2](svfloat16x2_t zd, svfloat16_t zn, - svfloat16_t zm); + svfloat16x2_t svclamp[_single_f16_x2](svfloat16x2_t zd, svfloat16_t zn, + svfloat16_t zm); - // Variants are also available for _f32, _f64, _s8, _s16, _s32, _s64, _u8, _u16, - // _u32 and _u64 + // Variants are also available for _single_s8_x4, _single_u8_x4, + // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, + // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 __attribute__((arm_streaming)) - svfloat16x4_t svclamp[_single][_f16][_x4](svfloat16x4_t zd, svfloat16_t zn, - svfloat16_t zm); + svfloat16x4_t svclamp[_single_f16_x4](svfloat16x4_t zd, svfloat16_t zn, + svfloat16_t zm); ``` @@ -11411,17 +11391,17 @@ Multi-vector clamp to minimum/maximum vector Reverse 64-bit doublewords in elements ``` c - // Variants are also available for _u16, _u32 and _u64 + // Variants are also available for _s8, _u16, _s16, _u32, _s32, _u64 and _s64 __attribute__((arm_streaming_compatible)) svuint8_t svrevd[_u8]_m(svuint8_t zd, svbool_t pg, svuint8_t zn); - // Variants are also available for _u16, _u32 and _u64 + // Variants are also available for _s8, _u16, _s16, _u32, _s32, _u64 and _s64 __attribute__((arm_streaming_compatible)) svuint8_t svrevd[_u8]_x(svbool_t pg, svuint8_t zn); - // Variants are also available for _u16, _u32 and _u64 + // Variants are also available for _s8, _u16, _s16, _u32, _s32, _u64 and _s64 __attribute__((arm_streaming_compatible)) svuint8_t svrevd[_u8]_z(svbool_t pg, svuint8_t zn); @@ -11432,14 +11412,16 @@ Reverse 64-bit doublewords in elements Multi-vector conditionally select elements from two vectors ``` c - // Variants are also available for _u16, _u32 and _u64 + // Variants are also available for _s8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 __attribute__((arm_streaming)) - svuint8x2_t svsel[_u8][_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm); + svuint8x2_t svsel[_u8_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm); - // Variants are also available for _u16, _u32 and _u64 + // Variants are also available for _s8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 __attribute__((arm_streaming)) - svuint8x4_t svsel[_u8][_x4](svcount_t png, 
svuint8x4_t zn, svuint8x4_t zm); + svuint8x4_t svsel[_u8_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm); ``` @@ -11448,14 +11430,18 @@ Multi-vector conditionally select elements from two vectors Multi-vector rounding shift left ``` c - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _single_u8_x2, _single_u16_x2, + // _single_s16_x2, _single_u32_x2, _single_s32_x2, _single_u64_x2 + // and _single_s64_x2 __attribute__((arm_streaming)) - svint8x2_t svrshl[_single][_s8][_x2](svint8x2_t zdn, svint8_t zm); + svint8x2_t svrshl[_single_s8_x2](svint8x2_t zdn, svint8_t zm); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _single_u8_x4, _single_u16_x4, + // _single_s16_x4, _single_u32_x4, _single_s32_x4, _single_u64_x4 + // and _single_s64_x4 __attribute__((arm_streaming)) - svint8x4_t svrshl[_single][_s8][_x4](svint8x4_t zdn, svint8_t zm); + svint8x4_t svrshl[_single_s8_x4](svint8x4_t zdn, svint8_t zm); ``` @@ -11464,14 +11450,16 @@ Multi-vector rounding shift left Multi-vector rounding shift left ``` c - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _u32_x2, _s32_x2, + // _u64_x2 and _s64_x2 __attribute__((arm_streaming)) - svint8x2_t svrshl[_s8][_x2](svint8x2_t zdn, svint8x2_t zm); + svint8x2_t svrshl[_s8_x2](svint8x2_t zdn, svint8x2_t zm); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _u32_x4, _s32_x4, + // _u64_x4 and _s64_x4 __attribute__((arm_streaming)) - svint8x4_t svrshl[_s8][_x4](svint8x4_t zdn, svint8x4_t zm); + svint8x4_t svrshl[_s8_x4](svint8x4_t zdn, svint8x4_t zm); ``` @@ -11482,22 +11470,21 @@ Multi-vector saturating rounding shift right narrow ``` c __attribute__((arm_streaming)) - svint16_t svqrshr[_s16][_x2](svint32x2_t zn, uint64_t imm_1_8); + svint16_t svqrshr[_s16_x2](svint32x2_t zn, uint64_t imm); // Variants are also available for _s16 __attribute__((arm_streaming)) - svint8_t svqrshr[_s8][_x4](svint32x4_t zn, uint64_t imm_1_8); - + svint8_t svqrshr[_s8_x4](svint32x4_t zn, uint64_t imm); __attribute__((arm_streaming)) - svuint16_t svuqrshr[_u16][_x2](svuint32x2_t zn, uint64_t imm_1_8); + svuint16_t svqrshr[_u16_x2](svuint32x2_t zn, uint64_t imm); // Variants are also available for _u16 __attribute__((arm_streaming)) - svuint8_t svuqrshr[_u8][_x4](svuint32x4_t zn, uint64_t imm_1_8); + svuint8_t svqrshr[_u8_x4](svuint32x4_t zn, uint64_t imm); ``` @@ -11506,14 +11493,19 @@ Multi-vector saturating rounding shift right narrow Multi-vector saturating rounding shift right narrow and interleave ``` c - // Variants are also available for _u16 + // Variants are also available for _u16_x2 __attribute__((arm_streaming_compatible)) - svint16_t svqrshrn[_s16][_x2](svint32x2_t zn, uint64_t imm_1_8); + svint16_t svqrshrn[_s16_x2](svint32x2_t zn, uint64_t imm); - // Variants are also available for _s16, _u8 and _u16 + // Variants are also available for _u8_x4 __attribute__((arm_streaming)) - svint8_t svqrshrn[_s8][_x4](svint32x4_t zn, uint64_t imm_1_8); + svint8_t svqrshrn[_s8_x4](svint32x4_t zn, uint64_t imm); + + + // Variants are also available for _u16_x4 + __attribute__((arm_streaming)) + svint16_t svqrshrn[_s16_x4](svint64x4_t zn, uint64_t imm); ``` @@ -11524,12 +11516,15 @@ Multi-vector saturating rounding shift right unsigned narrow ``` c 
__attribute__((arm_streaming)) - svuint16_t svsqrshru[_u16][_x2](svint32x2_t zn, uint64_t imm_1_8); + svuint16_t svsqrshru[_u16_x2](svint32x2_t zn, uint64_t imm); - // Variants are also available for _u16 __attribute__((arm_streaming)) - svuint8_t svsqrshru[_u8][_x4](svint32x4_t zn, uint64_t imm_1_8); + svuint8_t svsqrshru[_u8_x4](svint32x4_t zn, uint64_t imm); + + + __attribute__((arm_streaming)) + svuint16_t svsqrshru[_u16_x4](svint64x4_t zn, uint64_t imm); ``` @@ -11540,12 +11535,12 @@ Multi-vector saturating rounding shift right unsigned narrow and interleave ``` c __attribute__((arm_streaming_compatible)) - svuint16_t svsqrshrun[_u16][_x2](svint32x2_t zn, uint64_t imm_1_8); + svuint16_t svsqrshrun[_u16_x2](svint32x2_t zn, uint64_t imm); // Variants are also available for _u16 __attribute__((arm_streaming)) - svuint8_t svsqrshrun[_u8][_x4](svint32x4_t zn, uint64_t imm_1_8); + svuint8_t svsqrshrun[_u8_x4](svint32x4_t zn, uint64_t imm); ``` @@ -11554,14 +11549,16 @@ Multi-vector saturating rounding shift right unsigned narrow and interleave Multi-vector signed saturating doubling multiply high ``` c - // Variants are also available for _s16, _s32 and _s64 + // Variants are also available for _single_s16_x2, _single_s32_x2 + // and _single_s64_x2 __attribute__((arm_streaming)) - svint8x2_t svsqdmulh[_single][_s8][_x2](svint8x2_t zdn, svint8_t zm); + svint8x2_t svsqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm); - // Variants are also available for _s16, _s32 and _s64 + // Variants are also available for _single_s16_x4, _single_s32_x4 + // and _single_s64_x4 __attribute__((arm_streaming)) - svint8x4_t svsqdmulh[_single][_s8][_x4](svint8x4_t zdn, svint8_t zm); + svint8x4_t svsqdmulh[_single_s8_x4](svint8x4_t zdn, svint8_t zm); ``` @@ -11570,14 +11567,14 @@ Multi-vector signed saturating doubling multiply high Multi-vector signed saturating doubling multiply high ``` c - // Variants are also available for _s16, _s32 and _s64 + // Variants are also available for _s16_x2, _s32_x2 and _s64_x2 __attribute__((arm_streaming)) - svint8x2_t svsqdmulh[_s8][_x2](svint8x2_t zdn, svint8x2_t zm); + svint8x2_t svsqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm); - // Variants are also available for _s16, _s32 and _s64 + // Variants are also available for _s16_x4, _s32_x4 and _s64_x4 __attribute__((arm_streaming)) - svint8x4_t svsqdmulh[_s8][_x4](svint8x4_t zdn, svint8x4_t zm); + svint8x4_t svsqdmulh[_s8_x4](svint8x4_t zdn, svint8x4_t zm); ``` @@ -11588,42 +11585,42 @@ While (resulting in predicate-as-counter) ``` c // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilege_c8(int64_t rn, int64_t rm, sv_vgkind vl); + svcount_t svwhilege_c8(int64_t rn, int64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilegt_c8(int64_t rn, int64_t rm, sv_vgkind vl); + svcount_t svwhilegt_c8(int64_t rn, int64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilehi_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + svcount_t svwhilehi_c8(uint64_t rn, uint64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilehs_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + svcount_t svwhilehs_c8(uint64_t rn, uint64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilele_c8(int64_t rn, int64_t rm, sv_vgkind vl); 
+ svcount_t svwhilele_c8(int64_t rn, int64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilelo_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + svcount_t svwhilelo_c8(uint64_t rn, uint64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilels_c8(uint64_t rn, uint64_t rm, sv_vgkind vl); + svcount_t svwhilels_c8(uint64_t rn, uint64_t rm, uint64_t vl); // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) - svcount_t svwhilelt_c8(int64_t rn, int64_t rm, sv_vgkind vl); + svcount_t svwhilelt_c8(int64_t rn, int64_t rm, uint64_t vl); ``` @@ -11678,14 +11675,14 @@ While (resulting in predicate tuple) Multi-vector pack/unpack ``` c - // Variants are also available for _s32, _s64, _u16, _u32 and _u64 + // Variants are also available for _u16_x2, _u32_x2, _s32_x2, _u64_x2 and _s64_x2 __attribute__((arm_streaming)) - svint16x2_t svunpk[_s16][_x2](svint8_t zn); + svint16x2_t svunpk[_s16_x2](svint8_t zn); - // Variants are also available for _s32, _s64, _u16, _u32 and _u64 + // Variants are also available for _u16_x4, _u32_x4, _s32_x4, _u64_x4 and _s64_x4 __attribute__((arm_streaming)) - svint16x4_t svunpk[_s16][_x4](svint8x2_t zn); + svint16x4_t svunpk[_s16_x4](svint8x2_t zn); ``` @@ -11696,22 +11693,26 @@ Multi-vector zip/unzip (2 vectors) The uzipq instructions operate on quad-words, but for convenience accept all element types. ``` c - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm); @@ -11724,25 +11725,29 @@ Multi-vector zip/unzip (4 vectors) The zipq instructions operate on quad-words, but for convenience accept all element types. 
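As a hedged illustration of the quad-word behaviour described above (the full set of declarations follows below; the `_u32` spellings, the wrapper function and the use of the existing SVE tuple accessor `svget4` are illustrative assumptions):

``` c
  #include <arm_sme.h>

  // Minimal sketch: interleave four 32-bit vectors at quad-word (128-bit)
  // granularity and return the first vector of the resulting tuple.
  __attribute__((arm_streaming))
  svuint32_t zip_quadwords(svuint32_t z0, svuint32_t z1,
                           svuint32_t z2, svuint32_t z3) {
    svuint32x4_t zipped = svzipq_u32_x4(z0, z1, z2, z3);
    return svget4(zipped, 0);  // exact quad-word placement is as per ZIPQ
  }
```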
``` c - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x4_t svuzp[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, svint8_t zn3); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x4_t svuzpq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, svint8_t zn3); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x4_t svzip[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, svint8_t zn3); - // Variants are also available for _s16, _s32, _s64, _u8, _u16, _u32 and _u64 + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, + // _u64, _s64 and _f64 __attribute__((arm_streaming)) svint8x4_t svzipq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, svint8_t zn3); From ae149edc23de03ba6e8578e4824dd0bbfd141ab7 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 10 Nov 2022 17:41:09 +0000 Subject: [PATCH 03/25] Address a few more review comments I missed in the previous patch [to squash] --- main/acle.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/main/acle.md b/main/acle.md index 80d3b770..d1d65047 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9997,7 +9997,7 @@ Multi-vector vertical dot-product by indexed element. #### UMOPA, SMOPA, UMOPS, SMOPS -Integer sum of outer products and accumulate/subtract (2-way and 4-way) +Integer sum of outer products and accumulate/subtract (2-way) ``` c // Variants are also available for _za32[_u16] @@ -11468,23 +11468,20 @@ Multi-vector rounding shift left Multi-vector saturating rounding shift right narrow ``` c - + // Variants are also available for _u16_x2 __attribute__((arm_streaming)) svint16_t svqrshr[_s16_x2](svint32x2_t zn, uint64_t imm); - - // Variants are also available for _s16 + + // Variants are also available for _u8_x4 __attribute__((arm_streaming)) svint8_t svqrshr[_s8_x4](svint32x4_t zn, uint64_t imm); - + + // Variants are also available for _u16_x4 __attribute__((arm_streaming)) - svuint16_t svqrshr[_u16_x2](svuint32x2_t zn, uint64_t imm); - + svint16_t svqrshr[_s16_x4](svint64x4_t zn, uint64_t imm); - // Variants are also available for _u16 - __attribute__((arm_streaming)) - svuint8_t svqrshr[_u8_x4](svuint32x4_t zn, uint64_t imm); ``` From b61ff70264020677e545d40edf7c64978aedae50 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 28 Mar 2023 09:58:04 +0100 Subject: [PATCH 04/25] Remove trailing 'l' from svmlal/mlsl intrinsics [to squash] The 'l' is redundant because of the `_za32` or `_za64` suffixes already present in the function prototype, from which the 'long' or 'long long' can be deduced. --- main/acle.md | 274 +++++++++++++++++++++++++-------------------------- 1 file changed, 137 insertions(+), 137 deletions(-) diff --git a/main/acle.md b/main/acle.md index d1d65047..41c14ec9 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9500,8 +9500,8 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // SMLAL intrinsic for 2 quad-vector groups. 
__attributes__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); ``` * Every argument named `slice_x2_offset` must be an integer constant @@ -9526,18 +9526,18 @@ ZA array vectors. The intrinsics model this in the following way: ``` c __attributes__((arm_streaming, arm_shared_za)) - void svmlal_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8x2_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, + void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, svint8_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); ``` ### SME2 data-processing instructions. @@ -10159,23 +10159,23 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16_t zn, svbfloat16_t zm); + void svmla[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16_t zm); + void svmla[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16_t zm); + void svmla[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16_t zm); ``` @@ -10186,14 +10186,14 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16x2_t zm); + void svmla_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16x2_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16x4_t zm); + void svmla_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16x4_t zm); ``` @@ -10204,23 +10204,23 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, 
arm_shared_za)) - void svmlal_lane_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); + void svmla_lane_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x2_t zn, - svbfloat16_t zm, uint64_t imm_idx); + void svmla_lane_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x2_t zn, + svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x4_t zn, - svbfloat16_t zm, uint64_t imm_idx); + void svmla_lane_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x4_t zn, + svbfloat16_t zm, uint64_t imm_idx); ``` @@ -10231,23 +10231,23 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16_t zn, svbfloat16_t zm); + void svmls[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16_t zm); + void svmls[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16_t zm); + void svmls[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16_t zm); ``` @@ -10258,14 +10258,14 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16x2_t zm); + void svmls_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x2_t zn, svbfloat16x2_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16x4_t zm); + void svmls_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, + svbfloat16x4_t zn, svbfloat16x4_t zm); ``` @@ -10276,23 +10276,23 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_lane_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); + void svmls_lane_za32[_bf16]_vg2x1(uint32_t slice_base, + uint64_t 
slice_x2_offset, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_lane_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x2_t zn, - svbfloat16_t zm, uint64_t imm_idx); + void svmls_lane_za32[_bf16]_vg2x2(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x2_t zn, + svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_lane_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x4_t zn, - svbfloat16_t zm, uint64_t imm_idx); + void svmls_lane_za32[_bf16]_vg2x4(uint32_t slice_base, + uint64_t slice_x2_offset, svbfloat16x4_t zn, + svbfloat16_t zm, uint64_t imm_idx); ``` @@ -10307,9 +10307,9 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svint8_t zn, - svint8_t zm); + void svmla[_single]_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svint8_t zn, + svint8_t zm); // Variants are available for: @@ -10318,9 +10318,9 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, - svint8_t zm); + void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svint8_t zm); // Variants are available for: @@ -10329,39 +10329,39 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal[_single]_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn, - svint8_t zm); + void svmla[_single]_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn, + svint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsumlall[_single]_za32[_u8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, - svint8x2_t zn, svuint8_t zm); + void svsumla[_single]_za32[_u8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, + svint8x2_t zn, svuint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) - void svsumlall[_single]_za32[_u8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, - svint8x4_t zn, svuint8_t zm); + __attribute_(arm_streaming, arm_shared_za)) + void svsumla[_single]_za32[_u8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, + svint8x4_t zn, svuint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall[_single]_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, - svuint8_t zn, svint8_t zm); + __attribute_(arm_streaming, arm_shared_za)) + void svusmla[_single]_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, + svuint8_t zn, svint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, - svuint8x2_t zn, svint8_t zm); + __attribute_(arm_streaming, arm_shared_za)) + void 
svusmla[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, + svuint8x2_t zn, svint8_t zm); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall[_single]_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, - svuint8x4_t zn, svint8_t zm); + __attribute_(arm_streaming, arm_shared_za)) + void svusmla[_single]_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, + svuint8x4_t zn, svint8_t zm); ``` @@ -10376,8 +10376,8 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8x2_t zm); // Variants are available for: @@ -10386,18 +10386,18 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8x4_t zm); + void svmla_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8x4_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusmlall_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svuint8x2_t zn, svint8x2_t zm); + void svusmla_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svuint8x2_t zn, svint8x2_t zm); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svuint8x4_t zn, svint8x4_t zm); + __attribute_(arm_streaming, arm_shared_za)) + void svusmla_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svuint8x4_t zn, svint8x4_t zm); ``` @@ -10412,8 +10412,8 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, - svint8_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, + svint8_t zn, svint8_t zm, uint64_t imm_idx); // Variants are available for: @@ -10422,8 +10422,8 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); // Variants are available for: @@ -10432,44 +10432,44 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlal_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void 
svsumlall_lane_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svint8_t zn, - svuint8_t zm, uint64_t imm_idx); + void svsumla_lane_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svint8_t zn, + svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) - void svsumlall_lane_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, - svuint8_t zm, uint64_t imm_idx); + __attribute_(arm_streaming, arm_shared_za)) + void svsumla_lane_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) - void svsumlall_lane_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn, - svuint8_t zm, uint64_t imm_idx); + __attribute_(arm_streaming, arm_shared_za)) + void svsumla_lane_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn, + svuint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall_lane_za32[_u8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svuint8_t zn, - svint8_t zm, uint64_t imm_idx); + __attribute_(arm_streaming, arm_shared_za)) + void svusmla_lane_za32[_u8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svuint8_t zn, + svint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall_lane_za32[_u8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svuint8x2_t zn, - svint8_t zm, uint64_t imm_idx); + __attribute_(arm_streaming, arm_shared_za)) + void svusmla_lane_za32[_u8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svuint8x2_t zn, + svint8_t zm, uint64_t imm_idx); - __attribute__((arm_streaming, arm_shared_za)) - void svusmlall_lane_za32[_u8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svuint8x4_t zn, - svint8_t zm, uint64_t imm_idx); + __attribute_(arm_streaming, arm_shared_za)) + void svusmla_lane_za32[_u8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svuint8x4_t zn, + svint8_t zm, uint64_t imm_idx); ``` @@ -10484,9 +10484,9 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl[_single]_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svint8_t zn, - svint8_t zm); + void svmls[_single]_za32[_s8]_vg4x1(uint32_t slice_base, + uint64_t slice_x4_offset, svint8_t zn, + svint8_t zm); // Variants are available for: @@ -10495,9 +10495,9 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, - svint8_t zm); + void svmls[_single]_za32[_s8]_vg4x2(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x2_t zn, + svint8_t zm); // Variants are available for: @@ -10506,9 +10506,9 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl[_single]_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn, - svint8_t zm); + void svmls[_single]_za32[_s8]_vg4x4(uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn, + svint8_t 
zm); ``` @@ -10523,8 +10523,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8x2_t zm); + void svmls_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8x2_t zm); // Variants are available for: @@ -10533,8 +10533,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8x4_t zm); + void svmls_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8x4_t zm); ``` @@ -10549,8 +10549,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, - svint8_t zn, svint8_t zm, uint64_t imm_idx); + void svmls_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, + svint8_t zn, svint8_t zm, uint64_t imm_idx); // Variants are available for: @@ -10559,8 +10559,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmls_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x2_t zn, svint8_t zm, uint64_t imm_idx); // Variants are available for: @@ -10569,8 +10569,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmlsl_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + void svmls_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, + svint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` From 47ca61f987369fd68c55010cff1d347b3987d3d4 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 28 Mar 2023 10:22:53 +0100 Subject: [PATCH 05/25] Clarify the value of 'vl' operand for predicate-as-counter instructions [to squash] --- main/acle.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index 41c14ec9..23d7d4bc 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11349,7 +11349,7 @@ Predicate select between predicate value or all-false #### CNTP -Set scalar to count from predicate-as-counter +Set scalar to count from predicate-as-counter. ``vl`` is expected to be 2 or 4. ``` c // Variants are also available for _c16, _c32 and _c64 @@ -11577,7 +11577,7 @@ Multi-vector signed saturating doubling multiply high #### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT -While (resulting in predicate-as-counter) +While (resulting in predicate-as-counter). ``vl`` is expected to be 2 or 4. 
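As a hedged sketch of how the `vl` argument is meant to be used (the declarations follow below; the loop shape, the `_c32` spellings and the wrapper function are illustrative assumptions, and the caller is assumed to be a streaming function):

``` c
  #include <arm_sme.h>

  // Drive a loop four vectors at a time using a predicate-as-counter.
  __attribute__((arm_streaming))
  int64_t count_processed(int64_t n) {
    int64_t i = 0;
    while (i < n) {
      svcount_t pn = svwhilelt_c32(i, n, 4);  // vl = 4: mask spans 4 vectors
      uint64_t active = svcntp_c32(pn, 4);    // active 32-bit elements in pn
      // ... process `active` elements starting at index i ...
      i += (int64_t)active;
    }
    return i;
  }
```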
``` c // Variants are also available for _c16, _c32 and _c64 From 3b8a6e7ad94764182c28863aadc6a036803af2bd Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 30 Mar 2023 14:07:07 +0100 Subject: [PATCH 06/25] Rename svmlslb -> sbfmlslb and svmlslt ->svbfmlslt [to squash] This avoids a name clash with SVE intrinsics for svmlslb_f32/svmlslt_f32, and also aligns with what was done with bfmlalb/bfmlalt for SVE. --- main/acle.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main/acle.md b/main/acle.md index 23d7d4bc..15f372ab 100644 --- a/main/acle.md +++ b/main/acle.md @@ -10581,21 +10581,21 @@ BFloat16 floating-point multiply-subtract long from single-precision (top/bottom ``` c __attribute__((arm_streaming_compatible)) - svfloat32_t svmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); + svfloat32_t svbfmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); __attribute__((arm_streaming_compatible)) - svfloat32_t svmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); + svfloat32_t svbfmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); __attribute__((arm_streaming_compatible)) - svfloat32_t svmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); + svfloat32_t svbfmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); __attribute__((arm_streaming_compatible)) - svfloat32_t svmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); + svfloat32_t svbfmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, + svbfloat16_t zm, uint64_t imm_idx); ``` From d7d716d65de5661ed730577d8c9b76a0bccb534c Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 30 Mar 2023 14:08:18 +0100 Subject: [PATCH 07/25] Add tile operand to (svread|svwrite)_(hor|ver) intrinsics [to squash] For _za8, the tile can only be '0', hence the reason why the tile wasn't explicitly passed in. But there are overloaded forms for _za16, _za32, _za64 and _za128 that do require a tile, so this change adds the tile operand to all intrinsics. For _za8, it is up to the compiler to ensure that no value other than '0' can be passed. 
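A minimal sketch of the new form, assuming a streaming, shared-ZA caller and
the fully-suffixed svread_hor_za16_s16_vg2 spelling (ZA16 has two tiles, so a
tile value of 0 or 1 is valid; the _za8 forms still only accept 0):

  __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
  svint16x2_t read_two_slices_from_tile1(uint32_t slice_base) {
    // New leading 'tile' operand; slice_x2_offset stays a multiple of 2.
    return svread_hor_za16_s16_vg2(1, slice_base, 0);
  }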
--- main/acle.md | 28 ++++++++++++++++------------ 1 file changed, 16 insertions(+), 12 deletions(-) diff --git a/main/acle.md b/main/acle.md index 15f372ab..836819d6 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11224,7 +11224,8 @@ Move multi-vectors to/from ZA // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_hor_za8[_s8]_vg2(uint32_t slice_base, + svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, + uint32_t slice_base, uint64_t slice_x2_offset); @@ -11232,7 +11233,8 @@ Move multi-vectors to/from ZA // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x4_t svread_hor_za8[_s8]_vg4(uint32_t slice_base, + svint8x4_t svread_hor_za8[_s8]_vg4(uint64_t tile, + uint32_t slice_base, uint64_t slice_x4_offset); @@ -11240,7 +11242,8 @@ Move multi-vectors to/from ZA // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_ver_za8[_s8]_vg2(uint32_t slice_base, + svint8x2_t svread_ver_za8[_s8]_vg2(uint64_t tile, + uint32_t slice_base, uint64_t slice_x2_offset); @@ -11248,7 +11251,8 @@ Move multi-vectors to/from ZA // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x4_t svread_ver_za8[_s8]_vg4(uint32_t slice_base, + svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile, + uint32_t slice_base, uint64_t slice_x4_offset); @@ -11268,32 +11272,32 @@ Move multi-vectors to/from ZA // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset, - svint8x2_t zn); + void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice_base, + uint64_t slice_x2_offset, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn); + void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg2(uint32_t slice_base, uint64_t slice_x2_offset, - svint8x2_t zn); + void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice_base, + uint64_t slice_x2_offset, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn); + void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice_base, + uint64_t slice_x4_offset, svint8x4_t zn); // Variants are also available for 
_za64[_u64] and _za64[_f64] From 155977343794190e4905f5616362940c4c8399ed Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 30 Mar 2023 14:08:38 +0100 Subject: [PATCH 08/25] Type suffix in svread_za64[_s64] cannot be optional [to squash] The return type cannot be deduced from one of the operands, so must be explicitly specified in the name of the intrinsic. --- main/acle.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main/acle.md b/main/acle.md index 836819d6..3bb85392 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11256,16 +11256,16 @@ Move multi-vectors to/from ZA uint64_t slice_x4_offset); - // Variants are also available for _za64[_u64] and _za64[_f64] + // Variants are also available for _za64_u64 and _za64_f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x2_t svread_za64[_s64]_vg1x2(uint32_t slice_base, - uint64_t slice_offset); + svint64x2_t svread_za64_s64_vg1x2(uint32_t slice_base, + uint64_t slice_offset); - // Variants are also available for _za64[_u64] and _za64[_f64] + // Variants are also available for _za64_u64 and _za64_f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x4_t svread_za64[_s64]_vg1x4(uint32_t slice_base, - uint64_t slice_offset); + svint64x4_t svread_za64_s64_vg1x4(uint32_t slice_base, + uint64_t slice_offset); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], From 3e393205eb168b5a8a72edeb13f04fea1b470a49 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 30 Mar 2023 16:18:36 +0100 Subject: [PATCH 09/25] Fix the suffix for svusdot/svsudot to be consistent between _lane/_single [to squash] --- main/acle.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/main/acle.md b/main/acle.md index 3bb85392..ba1455c2 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9831,25 +9831,25 @@ Multi-vector dot-product (2-way and 4-way) __attribute__((arm_streaming, arm_shared_za)) - void svsudot[_single]_za32[_u8]_vg1x2(uint32_t slice_base, + void svsudot[_single]_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svint8x2_t zn, svuint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsudot[_single]_za32[_u8]_vg1x4(uint32_t slice_base, + void svsudot[_single]_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svint8x4_t zn, svuint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot[_single]_za32[_s8]_vg1x2(uint32_t slice_base, + void svusdot[_single]_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svuint8x2_t zn, svint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot[_single]_za32[_s8]_vg1x4(uint32_t slice_base, + void svusdot[_single]_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svuint8x4_t zn, svint8_t zm); @@ -9889,12 +9889,12 @@ Multi-vector dot-product (2-way and 4-way) __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + void svusdot_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, svuint8x2_t zn, svint8x2_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + void svusdot_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, svuint8x4_t zn, svint8x4_t zm); ``` From 739b82ae4ff00ce7bd14916e65fe38c626b230d7 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Thu, 30 Mar 2023 16:19:37 +0100 Subject: [PATCH 10/25] Change name for svdot, so that it doesn't clash with 
SVE [to squash] The following SME2 intrinsic: svint32_t svdot[_s32](svint32_t, svint8_t, svint8_t); clashes with the one we defined for SME2: svint32_t svdot[_s32](svint32_t, svint16_t, svint16_t); This patch renames these to: svint32_t svdot[_s32_s16_s16]((svint32_t, svint16_t, svint16_t); which is unambiguous and allows for the possibility of retroactively updating the SVE intrinsics to have a similar name (svdot[_s32_s8_s8]). --- main/acle.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index ba1455c2..413ef93f 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9777,9 +9777,9 @@ Multi-vector saturating extract narrow and interleave Multi-vector dot-product (2-way and 4-way) ``` c - // Variants are also available for _s32 and _u32 + // Variants are also available for _s32_s16_s16 and _u32_u16_u16 __attribute__((arm_streaming_compatible)) - svfloat32_t svdot[_f32](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm); + svfloat32_t svdot[_f32_f16_f16](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm); ``` From 85f22ecca44106c382f78319d50f8aa8b4e6bd7b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 26 Apr 2023 11:47:25 +0100 Subject: [PATCH 11/25] Add svcount_t forms for psel and pfalse intrinsics. The PSEL and PFALSE instructions have aliases to accept a predicate-as-counter register. It makes sense to add corresponding ACLE intrinsics for user convenience. --- main/acle.md | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index 413ef93f..f74cf57f 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11315,14 +11315,17 @@ Move multi-vectors to/from ZA #### PTRUE -Initialise predicate-as-counter to all active +Initialise predicate-as-counter to all active or all inactive. ``` c // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming)) svcount_t svptrue_c8(); - ``` + __attribute__((arm_streaming_compatible)) + svcount_t svpfalse_c(void); +``` + #### PEXT @@ -11349,6 +11352,9 @@ Predicate select between predicate value or all-false __attribute__((arm_streaming_compatible)) svbool_t svpsel_lane_b8(svbool_t pn, svbool_t pm, uint32_t idx, uint64_t imm); + // Variants are also available for _c16, _c32 and _c64 + __attribute__((arm_streaming_compatible)) + svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx, uint64_t imm); ``` #### CNTP From 18c5da294a88c3a4fbb4c8d3301c5f71293f1aff Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 28 Apr 2023 16:31:08 +0100 Subject: [PATCH 12/25] Cleanup, removed intrinsics that were added in SME [to squash] Also fixed a use of `svread_hor_za8` in the description of _vg2/_vg4 where it was missing the newly added tile operand. --- main/acle.md | 31 ++++--------------------------- 1 file changed, 4 insertions(+), 27 deletions(-) diff --git a/main/acle.md b/main/acle.md index f74cf57f..18a57af1 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9483,7 +9483,8 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. 
__attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_hor_za8[_s8]_vg2(uint32_t slice_base, + svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, + uint32_t slice_base, uint64_t slice_x2_offset); ``` @@ -9636,9 +9637,9 @@ The additional '_write' suffix indicates that the operation is not accumulating, ``` -#### ADD, SUB (vectors) +#### ADD (vectors) -Multi-vector add/sub +Multi-vector add ``` c // Variants are also available for _single_u8_x2, _single_s16_x2, @@ -11348,10 +11349,6 @@ Transform a predicate-as-counter to a predicate (pair). Predicate select between predicate value or all-false ``` c - // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svbool_t svpsel_lane_b8(svbool_t pn, svbool_t pm, uint32_t idx, uint64_t imm); - // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming_compatible)) svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx, uint64_t imm); @@ -11396,26 +11393,6 @@ Multi-vector clamp to minimum/maximum vector ``` -#### REVD - -Reverse 64-bit doublewords in elements - -``` c - // Variants are also available for _s8, _u16, _s16, _u32, _s32, _u64 and _s64 - __attribute__((arm_streaming_compatible)) - svuint8_t svrevd[_u8]_m(svuint8_t zd, svbool_t pg, svuint8_t zn); - - - // Variants are also available for _s8, _u16, _s16, _u32, _s32, _u64 and _s64 - __attribute__((arm_streaming_compatible)) - svuint8_t svrevd[_u8]_x(svbool_t pg, svuint8_t zn); - - - // Variants are also available for _s8, _u16, _s16, _u32, _s32, _u64 and _s64 - __attribute__((arm_streaming_compatible)) - svuint8_t svrevd[_u8]_z(svbool_t pg, svuint8_t zn); - - ``` #### SEL From 17a474c6e5aa82d54c93d46d95b3008958754d11 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 15 May 2023 13:45:52 +0100 Subject: [PATCH 13/25] Remove intrinsics for MOVT, since these instructions are only available in debug state [to squash] The spec explicitly says: This instruction is UNDEFINED in Non-debug state See for example: https://developer.arm.com/documentation/ddi0602/2022-09/SME-Instructions/MOVT--scalar-to-ZT0---Move-8-bytes-from-general-purpose-register-to-ZT0- --- main/acle.md | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/main/acle.md b/main/acle.md index 18a57af1..2ec77175 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11135,21 +11135,6 @@ Contiguous non-temporal store of multi-vector operand ``` -#### MOVT - -Move 8 bytes between general-purpose register and ZT0 - -``` c - - __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - uint64_t svread_lane_zt(uint64_t zt, uint64_t imm_x8_offset); - - - __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svwrite_lane_zt(uint64_t zt, uint64_t imm_x8_offset, uint64_t rt); - - ``` - #### LDR, STR Spill and fill of ZT0 From fedbb6c6055425091e64f5688d40c9b1606d807f Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 2 Oct 2023 10:49:27 +0100 Subject: [PATCH 14/25] Merge slice base/offset parameters [to squash] --- main/acle.md | 397 ++++++++++++++++++++------------------------------- 1 file changed, 151 insertions(+), 246 deletions(-) diff --git a/main/acle.md b/main/acle.md index a1212ec2..8a8f87c1 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9667,8 +9667,7 @@ ZA array vectors. The intrinsics model this in the following way: // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. 
__attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, - uint32_t slice_base, - uint64_t slice_x2_offset); + uint64_t slice); ``` * Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function @@ -9684,22 +9683,10 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // SMLAL intrinsic for 2 quad-vector groups. __attributes__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + svint8_t zm, uint64_t imm_idx); ``` -* Every argument named `slice_x2_offset` must be an integer constant - expression that is a multiple of 2. - - These immediates are required for indexing a double-vector group in the ZA - array or to index a vector group of 2 tile-slices. - -* Every argument named `slice_x4_offset` must be an integer constant - expression that is a multiple of 4. - - These immediates are required for indexing a quad-vector group in the ZA - array or to index a vector group of 4 tile-slices. - * Intrinsic functions that take a multi-vector operand may have additional suffixes to distinguish them from other forms for the same intrinsic: * a `_single` suffix if they take one multi-vector operand and one @@ -9710,18 +9697,16 @@ ZA array vectors. The intrinsics model this in the following way: ``` c __attributes__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, - svint8_t zm); + void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + svint8_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + svint8_t zm, uint64_t imm_idx); ``` ### SME2 data-processing instructions. 
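As a worked illustration of the conventions above (a hedged sketch, assuming a streaming, shared-ZA caller, a hypothetical wrapper function, and the fully-suffixed `_s8` spellings of the prototypes shown earlier):

``` c
  #include <arm_sme.h>

  // The single ZA index replaces the previous slice_base/slice_x4_offset pair.
  __attribute__((arm_streaming, arm_shared_za))
  void mla_example(uint64_t slice, svint8x2_t zn, svint8x2_t zm, svint8_t zs) {
    svmla_za32_s8_vg4x2(slice, zn, zm);         // multi-vector by multi-vector
    svmla_single_za32_s8_vg4x2(slice, zn, zs);  // multi-vector by single vector
  }
```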
@@ -9740,9 +9725,8 @@ The additional '_write' suffix indicates that the operation is not accumulating, // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write[_single]_za32[_s32]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, - svint32x2_t zn, svint32_t zm); + void svadd_write[_single]_za32[_s32]_vg1x2(uint64_t slice, svint32x2_t zn, + svint32_t zm); // Variants are available for: @@ -9751,27 +9735,24 @@ The additional '_write' suffix indicates that the operation is not accumulating, // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write[_single]_za32[_s32]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, - svint32x4_t zn, svint32_t zm); + void svadd_write[_single]_za32[_s32]_vg1x4(uint64_t slice, svint32x4_t zn, + svint32_t zm); // Variants are available for: // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write[_single]_za32[_u32]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, - svuint32x2_t zn, svuint32_t zm); + void svsub_write[_single]_za32[_u32]_vg1x2(uint64_t slice, svuint32x2_t zn, + svuint32_t zm); // Variants are available for: // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write[_single]_za32[_u32]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, - svuint32x4_t zn, svuint32_t zm); + void svsub_write[_single]_za32[_u32]_vg1x4(uint64_t slice, svuint32x4_t zn, + svuint32_t zm); ``` @@ -9789,7 +9770,7 @@ The additional '_write' suffix indicates that the operation is not accumulating, // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write_za32[_s32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + void svadd_write_za32[_s32]_vg1x2(uint64_t slice, svint32x2_t zn, svint32x2_t zm); @@ -9799,7 +9780,7 @@ The additional '_write' suffix indicates that the operation is not accumulating, // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write_za32[_s32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + void svadd_write_za32[_s32]_vg1x4(uint64_t slice, svint32x4_t zn, svint32x4_t zm); @@ -9807,7 +9788,7 @@ The additional '_write' suffix indicates that the operation is not accumulating, // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write_za32[_u32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, + void svsub_write_za32[_u32]_vg1x2(uint64_t slice, svuint32x2_t zn, svuint32x2_t zm); @@ -9815,7 +9796,7 @@ The additional '_write' suffix indicates that the operation is not accumulating, // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write_za32[_u32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, + void svsub_write_za32[_u32]_vg1x4(uint64_t slice, svuint32x4_t zn, svuint32x4_t zm); ``` @@ -9853,8 +9834,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) 
__attribute__((arm_streaming, arm_shared_za)) - void svadd_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svfloat32x2_t zm); + void svadd_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zm); // Variants are available for: @@ -9865,8 +9845,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svfloat32x4_t zm); + void svadd_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zm); // Variants are available for: @@ -9875,8 +9854,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svfloat32x2_t zm); + void svsub_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zm); // Variants are available for: @@ -9885,8 +9863,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svfloat32x4_t zm); + void svsub_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zm); ``` @@ -9994,9 +9971,8 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot[_single]_za32[_bf16]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, - svbfloat16x2_t zn, svbfloat16_t zm); + void svdot[_single]_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16_t zm); // Variants are available for: @@ -10009,32 +9985,27 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot[_single]_za32[_bf16]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, + void svdot[_single]_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn, svbfloat16_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsudot[_single]_za32[_s8]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, svint8x2_t zn, + void svsudot[_single]_za32[_s8]_vg1x2(uint64_t slice, svint8x2_t zn, svuint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsudot[_single]_za32[_s8]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, svint8x4_t zn, + void svsudot[_single]_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svuint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot[_single]_za32[_u8]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, svuint8x2_t zn, + void svusdot[_single]_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot[_single]_za32[_u8]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, svuint8x4_t zn, + void svusdot[_single]_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8_t zm); ``` @@ -10054,8 +10025,8 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_za32[_bf16]_vg1x2(uint32_t slice_base, uint64_t 
slice_offset, - svbfloat16x2_t zn, svbfloat16x2_t zm); + void svdot_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16x2_t zm); // Variants are available for: @@ -10068,18 +10039,16 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_za32[_bf16]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svbfloat16x4_t zn, svbfloat16x4_t zm); + void svdot_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn, + svbfloat16x4_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svuint8x2_t zn, svint8x2_t zm); + void svusdot_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8x2_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svuint8x4_t zn, svint8x4_t zm); + void svusdot_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8x4_t zm); ``` @@ -10098,9 +10067,8 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_lane_za32[_bf16]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svbfloat16x2_t zn, svbfloat16_t zm, - uint64_t imm_idx); + void svdot_lane_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16_t zm, uint64_t imm_idx); // Variants are available for: @@ -10113,32 +10081,27 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_lane_za32[_bf16]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svbfloat16x4_t zn, svbfloat16_t zm, - uint64_t imm_idx); + void svdot_lane_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn, + svbfloat16_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svsudot_lane_za32[_s8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svint8x2_t zn, svuint8_t zm, + void svsudot_lane_za32[_s8]_vg1x2(uint64_t slice, svint8x2_t zn, svuint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svsudot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svint8x4_t zn, svuint8_t zm, + void svsudot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_lane_za32[_u8]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svuint8x2_t zn, svint8_t zm, + void svusdot_lane_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_lane_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svuint8x4_t zn, svint8_t zm, + void svusdot_lane_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -10150,22 +10113,19 @@ Multi-vector vertical dot-product by indexed element. 
``` c __attribute__((arm_streaming, arm_shared_za)) - void svsuvdot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svint8x4_t zn, svuint8_t zm, - uint64_t imm_idx); + void svsuvdot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, + svuint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svusvdot_lane_za32[_u8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svuint8x4_t zn, svint8_t zm, - uint64_t imm_idx); + void svusvdot_lane_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, + svint8_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svvdot_lane_za32[_bf16]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svbfloat16x2_t zn, svbfloat16_t zm, - uint64_t imm_idx); + void svvdot_lane_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16_t zm, uint64_t imm_idx); // Variants are available for: @@ -10174,8 +10134,8 @@ Multi-vector vertical dot-product by indexed element. // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svvdot_lane_za32[_s8]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + void svvdot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svint8_t zm, + uint64_t imm_idx); ``` @@ -10223,8 +10183,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_f32]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, svfloat32x2_t zn, + void svmla[_single]_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, svfloat32_t zm); @@ -10232,8 +10191,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_f32]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, svfloat32x4_t zn, + void svmla[_single]_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, svfloat32_t zm); @@ -10241,8 +10199,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_f32]_vg1x2(uint32_t slice_base, - uint64_t slice_offset, svfloat32x2_t zn, + void svmls[_single]_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, svfloat32_t zm); @@ -10250,8 +10207,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_f32]_vg1x4(uint32_t slice_base, - uint64_t slice_offset, svfloat32x4_t zn, + void svmls[_single]_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, svfloat32_t zm); ``` @@ -10265,32 +10221,32 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svfloat32x2_t zn, svfloat32x2_t zm); + void svmla_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + svfloat32x2_t zm); // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void 
svmla_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svfloat32x4_t zn, svfloat32x4_t zm); + void svmla_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + svfloat32x4_t zm); // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svfloat32x2_t zn, svfloat32x2_t zm); + void svmls_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + svfloat32x2_t zm); // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svfloat32x4_t zn, svfloat32x4_t zm); + void svmls_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + svfloat32x4_t zm); ``` @@ -10303,36 +10259,32 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svfloat32x2_t zn, svfloat32_t zm, - uint64_t imm_idx); + void svmla_lane_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + svfloat32_t zm, uint64_t imm_idx); // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svfloat32x4_t zn, svfloat32_t zm, - uint64_t imm_idx); + void svmla_lane_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + svfloat32_t zm, uint64_t imm_idx); // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_f32]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svfloat32x2_t zn, svfloat32_t zm, - uint64_t imm_idx); + void svmls_lane_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + svfloat32_t zm, uint64_t imm_idx); // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_f32]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svfloat32x4_t zn, svfloat32_t zm, - uint64_t imm_idx); + void svmls_lane_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + svfloat32_t zm, uint64_t imm_idx); ``` @@ -10343,23 +10295,20 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16_t zn, svbfloat16_t zm); + void svmla[_single]_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, + svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16_t zm); + void svmla[_single]_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16_t zm); + void svmla[_single]_za32[_bf16]_vg2x4(uint64_t 
slice, svbfloat16x4_t zn, + svbfloat16_t zm); ``` @@ -10370,14 +10319,14 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16x2_t zm); + void svmla_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16x2_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_bf16]_vg2x4(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16x4_t zm); + void svmla_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + svbfloat16x4_t zm); ``` @@ -10388,22 +10337,19 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16_t zn, + void svmla_lane_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x2_t zn, + void svmla_lane_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x4_t zn, + void svmla_lane_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, svbfloat16_t zm, uint64_t imm_idx); ``` @@ -10415,23 +10361,20 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16_t zn, svbfloat16_t zm); + void svmls[_single]_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, + svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16_t zm); + void svmls[_single]_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16_t zm); + void svmls[_single]_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + svbfloat16_t zm); ``` @@ -10442,14 +10385,14 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_bf16]_vg2x2(uint32_t slice_base, uint64_t slice_x2_offset, - svbfloat16x2_t zn, svbfloat16x2_t zm); + void svmls_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + svbfloat16x2_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_bf16]_vg2x4(uint32_t slice_base, 
uint64_t slice_x2_offset, - svbfloat16x4_t zn, svbfloat16x4_t zm); + void svmls_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + svbfloat16x4_t zm); ``` @@ -10460,22 +10403,19 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_bf16]_vg2x1(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16_t zn, + void svmls_lane_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_bf16]_vg2x2(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x2_t zn, + void svmls_lane_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_bf16]_vg2x4(uint32_t slice_base, - uint64_t slice_x2_offset, svbfloat16x4_t zn, + void svmls_lane_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, svbfloat16_t zm, uint64_t imm_idx); ``` @@ -10491,8 +10431,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svint8_t zn, + void svmla[_single]_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm); @@ -10502,8 +10441,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, + void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm); @@ -10513,39 +10451,33 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn, + void svmla[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsumla[_single]_za32[_u8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, - svint8x2_t zn, svuint8_t zm); + void svsumla[_single]_za32[_u8]_vg4x2(uint64_t slice, svint8x2_t zn, + svuint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svsumla[_single]_za32[_u8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, - svint8x4_t zn, svuint8_t zm); + void svsumla[_single]_za32[_u8]_vg4x4(uint64_t slice, svint8x4_t zn, + svuint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla[_single]_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, - svuint8_t zn, svint8_t zm); + void svusmla[_single]_za32[_s8]_vg4x1(uint64_t slice, svuint8_t zn, + svint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, - svuint8x2_t zn, svint8_t zm); + void svusmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svuint8x2_t zn, + svint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void 
svusmla[_single]_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, - svuint8x4_t zn, svint8_t zm); + void svusmla[_single]_za32[_s8]_vg4x4(uint64_t slice, svuint8x4_t zn, + svint8_t zm); ``` @@ -10560,8 +10492,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); // Variants are available for: @@ -10570,18 +10501,15 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8x4_t zm); + void svmla_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8x4_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusmla_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svuint8x2_t zn, svint8x2_t zm); + void svusmla_za32[_s8]_vg4x2(uint64_t slice, svuint8x2_t zn, svint8x2_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svuint8x4_t zn, svint8x4_t zm); + void svusmla_za32[_s8]_vg4x4(uint64_t slice, svuint8x4_t zn, svint8x4_t zm); ``` @@ -10596,8 +10524,8 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, - svint8_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm, + uint64_t imm_idx); // Variants are available for: @@ -10606,8 +10534,8 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm, + uint64_t imm_idx); // Variants are available for: @@ -10616,43 +10544,37 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + void svmla_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm, + uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svsumla_lane_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svint8_t zn, + void svsumla_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svuint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svsumla_lane_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, + void svsumla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svuint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svsumla_lane_za32[_s8]_vg4x4(uint32_t slice_base, 
- uint64_t slice_x4_offset, svint8x4_t zn, + void svsumla_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_lane_za32[_u8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svuint8_t zn, + void svusmla_lane_za32[_u8]_vg4x1(uint64_t slice, svuint8_t zn, svint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_lane_za32[_u8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svuint8x2_t zn, + void svusmla_lane_za32[_u8]_vg4x2(uint64_t slice, svuint8x2_t zn, svint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_lane_za32[_u8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svuint8x4_t zn, + void svusmla_lane_za32[_u8]_vg4x4(uint64_t slice, svuint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -10668,8 +10590,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_s8]_vg4x1(uint32_t slice_base, - uint64_t slice_x4_offset, svint8_t zn, + void svmls[_single]_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm); @@ -10679,8 +10600,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_s8]_vg4x2(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x2_t zn, + void svmls[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm); @@ -10690,8 +10610,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_s8]_vg4x4(uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn, + void svmls[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm); ``` @@ -10707,8 +10626,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8x2_t zm); + void svmls_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); // Variants are available for: @@ -10717,8 +10635,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8x4_t zm); + void svmls_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8x4_t zm); ``` @@ -10733,8 +10650,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_s8]_vg4x1(uint32_t slice_base, uint64_t slice_x4_offset, - svint8_t zn, svint8_t zm, uint64_t imm_idx); + void svmls_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm, + uint64_t imm_idx); // Variants are available for: @@ -10743,8 +10660,8 @@ Multi-vector multiply-subtract long long 
(widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_s8]_vg4x2(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x2_t zn, svint8_t zm, uint64_t imm_idx); + void svmls_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm, + uint64_t imm_idx); // Variants are available for: @@ -10753,8 +10670,8 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_s8]_vg4x4(uint32_t slice_base, uint64_t slice_x4_offset, - svint8x4_t zn, svint8_t zm, uint64_t imm_idx); + void svmls_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm, + uint64_t imm_idx); ``` @@ -11394,8 +11311,7 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, - uint32_t slice_base, - uint64_t slice_x2_offset); + uint64_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], @@ -11403,8 +11319,7 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x4_t svread_hor_za8[_s8]_vg4(uint64_t tile, - uint32_t slice_base, - uint64_t slice_x4_offset); + uint64_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], @@ -11412,8 +11327,7 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t svread_ver_za8[_s8]_vg2(uint64_t tile, - uint32_t slice_base, - uint64_t slice_x2_offset); + uint64_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], @@ -11421,64 +11335,55 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile, - uint32_t slice_base, - uint64_t slice_x4_offset); + uint64_t slice); // Variants are also available for _za64_u64 and _za64_f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x2_t svread_za64_s64_vg1x2(uint32_t slice_base, - uint64_t slice_offset); + svint64x2_t svread_za64_s64_vg1x2(uint64_t slice); // Variants are also available for _za64_u64 and _za64_f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x4_t svread_za64_s64_vg1x4(uint32_t slice_base, - uint64_t slice_offset); + svint64x4_t svread_za64_s64_vg1x4(uint64_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice_base, - uint64_t slice_x2_offset, svint8x2_t zn); + void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint64_t slice, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn); + void 
svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint64_t slice, svint8x4_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice_base, - uint64_t slice_x2_offset, svint8x2_t zn); + void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint64_t slice, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice_base, - uint64_t slice_x4_offset, svint8x4_t zn); + void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint64_t slice, svint8x4_t zn); // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_za64[_s64]_vg1x2(uint32_t slice_base, uint64_t slice_offset, - svint64x2_t zn); + void svwrite_za64[_s64]_vg1x2(uint64_t slice, svint64x2_t zn); // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_za64[_s64]_vg1x4(uint32_t slice_base, uint64_t slice_offset, - svint64x4_t zn); + void svwrite_za64[_s64]_vg1x4(uint64_t slice, svint64x4_t zn); ``` From 8ca7c52a970a6db141f74ef50e1b6f3b0cb50391 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 2 Oct 2023 11:35:31 +0100 Subject: [PATCH 15/25] Address @rsandifo-arm's review comments [to squash] --- main/acle.md | 325 +++++++++++++++++++++++++-------------------------- 1 file changed, 160 insertions(+), 165 deletions(-) diff --git a/main/acle.md b/main/acle.md index 8a8f87c1..ac7ab38b 100644 --- a/main/acle.md +++ b/main/acle.md @@ -8971,9 +8971,6 @@ definitions**. It specifies the following: to zero. * When the hardware supports SME2, the function has [ZT state](#zt-state). - The function's ZT state is created on entry to the function and destroyed - on return from the function. That is, the function does not use ZT0 - to receive data from callers or to pass data back to callers. This attribute does not change a function's binary interface. If the function forms part of the object code's ABI, that object code function @@ -9078,6 +9075,44 @@ that do not have the attribute. However, the reverse is not true. For example: } ``` +## SME types + +### Predicate-as-counter + +SME2 adds a new kind of predicate, named *predicate-as-counter* which is used +for multi-vector predication. It describes a predicate mask that can span +multiple predicate registers with `K` `true` values followed by all `false` +values, or `K` `false` values followed by all `true` values, for a given element +type. + +When `__ARM_FEATURE_SME2` is defined, [``](#arm_sme.h) defines a +single sizeless predicate-as-counter type named `svcount_t`. + +`svcount_t` and `svbool_t` are both used to represent predicate masks, but +they cannot be used interchangeably. + +The ACLE allows these types to be casted from one to another using the +`svcount_t svreinterpret_c(svbool_t)` and `svbool_t svreinterpret_b(svcount_t)` +intrinsics, although the reinterpreted values may not be sensible in the other +format. To safely extract a sensible mask from a `svcount_t`, the `svpext` +functions should be used. 
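+
+As an illustration, the sketch below contrasts reinterpreting with safe
+extraction. The name `svpext_lane_c8` is an assumption made for this example
+only; the actual PEXT intrinsic signatures are defined with the other SME2
+intrinsics and may differ from the name shown here.
+
+``` c
+  #include <arm_sme.h>
+
+  // Minimal sketch: svpext_lane_c8 is a hypothetical name used purely to
+  // illustrate the distinction described above.
+  __attribute__((arm_streaming))
+  svbool_t extract_mask(svcount_t pn) {
+    // Reinterpretation reuses the underlying value directly; the result is
+    // not guaranteed to be a sensible svbool_t mask.
+    svbool_t reinterpreted = svreinterpret_b(pn);
+    (void)reinterpreted;
+
+    // A PEXT intrinsic extracts a well-defined svbool_t mask for one
+    // vector's worth of 8-bit elements from the predicate-as-counter.
+    return svpext_lane_c8(pn, 0);
+  }
+```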
+ +### Multi-vector predicates + +When `__ARM_FEATURE_SME2` is defined, [``](#arm_sme.h) defines the +tuple types `svboolx2_t` and `svboolx4_t`. + +These are opaque tuple types that can be accessed using the SVE intrinsics +`svsetN`, `svgetN` and `svcreateN`. `svundef2` and `svundef4` are also extended +to work with `svboolx2_t` and `svboolx4_t`. e.g. + +``` c + svbool_t svget2[_b](svboolx2_t tuple, uint64_t imm_index); + svboolx2_t svset2[_b](svboolx2_t tuple, uint64_t imm_index, svbool_t x); + svboolx2_t svcreate2[_b](svbool_t x, svbool_t y); + svboolx2_t svundef2_b(); +``` + ## SME functions and intrinsics [``](#arm_sme.h) declares various support functions and @@ -9167,9 +9202,8 @@ following it. --> > the compiler does not insert unnecessary code to save and restore the > current ZA contents. The call might also be useful for static analysis. -### SME instruction intrinsics -#### Common rules +### SME instruction intrinsics The intrinsics in this section have the following properties in common: @@ -9196,6 +9230,74 @@ The intrinsics in this section have the following properties in common: of a given ZA tile and a `_ver` suffix if they operate on vertical slices of a given ZA tile. + +SME2 adds operations that work on groups of SVE vectors, ZA tile slices or +ZA array vectors. The intrinsics model this in the following way: + +* Multi-vector operands are groups of SVE data vectors, that use the same + tuple types as defined in the [SVE ACLE](#sve-vector-types), e.g. + `svint32x2_t` for a multi-vector operand of two 32-bit element vectors, or + `svint64x4_t` for a multi-vector operand of four 64-bit element vectors. + +* The architecture distinguishes between multi-vector operands with + consecutive registers and multi-vector operands with strided registers. + This level of detail is not exposed to the C/C++ intrinsics or types. It is + left up to the compiler to choose the most optimal form. + +* Intrinsic functions have a `_x2` or `_x4` suffix if the + function\'s return value is a vector group of 2 or 4 data vectors + and the function operates purely on vectors, not on the matrix array or + tile slices. + +* Intrinsic functions have a `_vg2` or `_vg4` suffix if the function + operates on groups of 2 or 4 ZA tile slices. For example: + +``` c + // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. + __attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) + svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint64_t slice); +``` + +* Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function + operates on 2 or 4 single-vector groups within the ZA array. + +* Intrinsic functions have a `_vg2x1`, `_vg2x2`, `_vg2x4` suffix if + the function operates on 1, 2 or 4 double-vector groups within the ZA array. + +* Intrinsic functions have a `_vg4x1`, `_vg4x2`, `_vg4x4` suffix if the + function operates on 1, 2 or 4 quad-vector groups within the ZA array. + For example: + +``` c + // SMLAL intrinsic for 2 quad-vector groups. + __attributes__((arm_streaming, arm_shared_za)) + void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + svint8_t zm, uint64_t imm_idx); +``` + +* Intrinsic functions that take a multi-vector operand may have additional + suffixes to distinguish them from other forms for the same intrinsic: + * a `_single` suffix if they take one multi-vector operand and one + (single) vector operand. + * a `_lane` suffix if they take one multi-vector operand and one + indexed vector operand with an immediate to specify the indexed + elements. 
+ +``` c + __attributes__((arm_streaming, arm_shared_za)) + void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); + + + __attributes__((arm_streaming, arm_shared_za)) + void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + svint8_t zm); + + __attributes__((arm_streaming, arm_shared_za)) + void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + svint8_t zm, uint64_t imm_idx); +``` + + #### LD1B, LD1H, LD1W, LD1D, LD1Q ``` c @@ -9578,145 +9680,17 @@ possible to write these operations using normal C arithmetic. For example: void svzero_za() __arm_streaming_compatible __arm_shared_za; ``` -### Streaming-compatible versions of standard routines - -ACLE provides the following streaming-compatible functions, -with the same behavior as the standard C functions that they -are named after. All of the functions have external linkage. - -``` c - void *__arm_sc_memcpy(void *dest, const void *src, size_t n) - __arm_streaming_compatible __arm_preserves_za; - - void *__arm_sc_memmove(void *dest, const void *src, size_t n) - __arm_streaming_compatible __arm_preserves_za; - - void *__arm_sc_memset(void *s, int c, size_t n) - __arm_streaming_compatible __arm_preserves_za; - - void *__arm_sc_memchr(void *s, int c, size_t n) - __arm_streaming_compatible __arm_preserves_za; -``` - -## SME2 Types - -### Predicate-as-counter - -SME2 adds a new kind of predicate, named *predicate-as-counter* which is used -for multi-vector predication. It describes a predicate mask that can span multiple -predicate registers with `K` `true` values followed by all `false` values, or -`K` `false` values followed by all `true` values, for a given element type. - -When `__ARM_FEATURE_SME2` is defined, [``](#arm_sme.h) defines a -single sizeless predicate-as-counter type named `svcount_t`. - -`svcount_t` and `svbool_t` are both used to represent predicate masks, but -they cannot be used interchangeably. - -The ACLE allows these types to be casted from one to another using the -`svcount_t svreinterpret_c(svbool_t)` and `svbool_t svreinterpret_b(svcount_t)` -intrinsics, although the reinterpreted values may not be sensible in the other -format. To safely extract a sensible mask from a `svcount_t`, the `svpext` -functions should be used. - -### Multi-vector predicates - -When `__ARM_FEATURE_SME2` is defined, [``](#arm_sme.h) defines the tuple types -`svboolx2_t` and `svboolx4_t`. - -These are opaque tuple types that can be accessed using the existing SVE -intrinsics `svsetN`, `svgetN` and `svcreateN`. `svundef2` and `svundef4` -are also extended to work with `svboolx2_t` and `svboolx4_t`. e.g. - -``` c - svbool_t svget2[_b](svboolx2_t tuple, uint64_t imm_index); - svboolx2_t svset2[_b](svboolx2_t tuple, uint64_t imm_index, svbool_t x); - svboolx2_t svcreate2[_b](svbool_t x, svbool_t y); - svboolx2_t svundef2_b(); -``` - -## SME2 functions +### SME2 instruction intrinsics -The functions in this section are defined by the header file +The intrinsics in this section are defined by the header file [``](#arm_sme.h) when `__ARM_FEATURE_SME2` is defined. -#### Common rules - -SME2 adds operations that work on groups of SVE vectors, ZA tile slices or -ZA array vectors. The intrinsics model this in the following way: - -* Multi-vector operands are groups of SVE data vectors, that use the same - tuple types as defined in the [SVE ACLE](#sve-vector-types), e.g. 
- `svint32x2_t` for a multi-vector operand of two 32-bit element vectors, or - `svint64x4_t` for a multi-vector operand of four 64-bit element vectors. - -* The architecture distinguishes between multi-vector operands with - consecutive registers and multi-vector operands with strided registers. - This level of detail is not exposed to the C/C++ intrinsics or types. It is - left up to the compiler to choose the most optimal form. - -* Intrinsic functions have a `_x2` or `_x4` suffix if the - function\'s return value is a vector group of 2 or 4 data vectors - and the function operates purely on vectors, not on the matrix array or - tile slices. - -* Intrinsic functions have a `_vg2` or `_vg4` suffix if the function - operates on groups of 2 or 4 ZA tile slices. For example: - -``` c - // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. - __attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, - uint64_t slice); -``` - -* Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function - operates on 2 or 4 single-vector groups within a ZA array. - -* Intrinsic functions have a `_vg2x1`, `_vg2x2`, `_vg2x4` suffix if - the function operates on 1, 2 or 4 double-vector groups within a ZA array. - -* Intrinsic functions have a `_vg4x1`, `_vg4x2`, `_vg4x4` suffix if the - function operates on 1, 2 or 4 quad-vector groups within a ZA array. - For example: - -``` c - // SMLAL intrinsic for 2 quad-vector groups. - __attributes__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, - svint8_t zm, uint64_t imm_idx); -``` - -* Intrinsic functions that take a multi-vector operand may have additional - suffixes to distinguish them from other forms for the same intrinsic: - * a `_single` suffix if they take one multi-vector operand and one - (single) vector operand. - * a `_lane` suffix if they take one multi-vector operand and one - indexed vector operand with an immediate to specify the indexed - elements. - -``` c - __attributes__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); - - - __attributes__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, - svint8_t zm); - - __attributes__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, - svint8_t zm, uint64_t imm_idx); -``` - -### SME2 data-processing instructions. - #### ADD, SUB (store into ZA, single) Multi-vector add/sub, storing into ZA -The additional '_write' suffix indicates that the operation is not accumulating, - the result is written directly into ZA. +The additional '_write' suffix indicates that the operation is not accumulating; +the result is written directly into ZA. ``` c // Variants are available for: @@ -9760,8 +9734,8 @@ The additional '_write' suffix indicates that the operation is not accumulating, Multi-vector add/sub, storing into ZA -The additional '_write' suffix indicates that the operation is not accumulating, - the result is written directly into ZA. +The additional '_write' suffix indicates that the operation is not accumulating; +the result is written directly into ZA. ``` c // Variants are available for: @@ -11681,8 +11655,6 @@ While (resulting in predicate-as-counter). ``vl`` is expected to be 2 or 4. 
``` -#### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT - While (resulting in predicate tuple) ``` c @@ -11743,74 +11715,97 @@ Multi-vector pack/unpack ``` -#### ZIP, UZP - -Multi-vector zip/unzip (2 vectors) +#### ZIP -The uzipq instructions operate on quad-words, but for convenience accept all element types. +Multi-vector zip. ``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm); + svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm); // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm); - - + svint8x4_t svzip[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); + ``` + +The `svzipq` intrinsics operate on quad-words, but for convenience accept all +element types. + + +``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm); - + svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm); + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm); - + svint8x4_t svzipq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); ``` -#### ZIP, UZP - -Multi-vector zip/unzip (4 vectors) +#### UZP -The zipq instructions operate on quad-words, but for convenience accept all element types. +Multi-vector unzip. ``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x4_t svuzp[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); + svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm); // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x4_t svuzpq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); - + svint8x4_t svuzp[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8_t zn3); + ``` +The `svuzpq` intrinsics operate on quad-words, but for convenience accept all +element types. + +``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x4_t svzip[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); + svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm); // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 __attribute__((arm_streaming)) - svint8x4_t svzipq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, + svint8x4_t svuzpq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, svint8_t zn3); ``` +### Streaming-compatible versions of standard routines + +ACLE provides the following streaming-compatible functions, +with the same behavior as the standard C functions that they +are named after. All of the functions have external linkage. 
+ +``` c + void *__arm_sc_memcpy(void *dest, const void *src, size_t n) + __arm_streaming_compatible __arm_preserves_za; + + void *__arm_sc_memmove(void *dest, const void *src, size_t n) + __arm_streaming_compatible __arm_preserves_za; + + void *__arm_sc_memset(void *s, int c, size_t n) + __arm_streaming_compatible __arm_preserves_za; + + void *__arm_sc_memchr(void *s, int c, size_t n) + __arm_streaming_compatible __arm_preserves_za; +``` # M-profile Vector Extension (MVE) intrinsics From 6e8b68d497cbf4d225a069a7f8a6420d780b77ab Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Mon, 2 Oct 2023 15:12:37 +0100 Subject: [PATCH 16/25] Change uint64_t slice -> uint32_t slice --- main/acle.md | 212 +++++++++++++++++++++++++-------------------------- 1 file changed, 106 insertions(+), 106 deletions(-) diff --git a/main/acle.md b/main/acle.md index ac7ab38b..248bcc4c 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9255,7 +9255,7 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. __attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint64_t slice); + svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice); ``` * Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function @@ -9271,7 +9271,7 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // SMLAL intrinsic for 2 quad-vector groups. __attributes__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -9285,15 +9285,15 @@ ZA array vectors. The intrinsics model this in the following way: ``` c __attributes__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm); __attributes__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -9699,7 +9699,7 @@ the result is written directly into ZA. // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write[_single]_za32[_s32]_vg1x2(uint64_t slice, svint32x2_t zn, + void svadd_write[_single]_za32[_s32]_vg1x2(uint32_t slice, svint32x2_t zn, svint32_t zm); @@ -9709,7 +9709,7 @@ the result is written directly into ZA. // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write[_single]_za32[_s32]_vg1x4(uint64_t slice, svint32x4_t zn, + void svadd_write[_single]_za32[_s32]_vg1x4(uint32_t slice, svint32x4_t zn, svint32_t zm); @@ -9717,7 +9717,7 @@ the result is written directly into ZA. 
// _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write[_single]_za32[_u32]_vg1x2(uint64_t slice, svuint32x2_t zn, + void svsub_write[_single]_za32[_u32]_vg1x2(uint32_t slice, svuint32x2_t zn, svuint32_t zm); @@ -9725,7 +9725,7 @@ the result is written directly into ZA. // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write[_single]_za32[_u32]_vg1x4(uint64_t slice, svuint32x4_t zn, + void svsub_write[_single]_za32[_u32]_vg1x4(uint32_t slice, svuint32x4_t zn, svuint32_t zm); ``` @@ -9744,7 +9744,7 @@ the result is written directly into ZA. // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write_za32[_s32]_vg1x2(uint64_t slice, + void svadd_write_za32[_s32]_vg1x2(uint32_t slice, svint32x2_t zn, svint32x2_t zm); @@ -9754,7 +9754,7 @@ the result is written directly into ZA. // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_write_za32[_s32]_vg1x4(uint64_t slice, + void svadd_write_za32[_s32]_vg1x4(uint32_t slice, svint32x4_t zn, svint32x4_t zm); @@ -9762,7 +9762,7 @@ the result is written directly into ZA. // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write_za32[_u32]_vg1x2(uint64_t slice, + void svsub_write_za32[_u32]_vg1x2(uint32_t slice, svuint32x2_t zn, svuint32x2_t zm); @@ -9770,7 +9770,7 @@ the result is written directly into ZA. // _za32[_u32] // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_write_za32[_u32]_vg1x4(uint64_t slice, + void svsub_write_za32[_u32]_vg1x4(uint32_t slice, svuint32x4_t zn, svuint32x4_t zm); ``` @@ -9808,7 +9808,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zm); + void svadd_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm); // Variants are available for: @@ -9819,7 +9819,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svadd_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zm); + void svadd_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm); // Variants are available for: @@ -9828,7 +9828,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zm); + void svsub_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm); // Variants are available for: @@ -9837,7 +9837,7 @@ Multi-vector add/sub and accumulate into ZA // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svsub_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zm); + void svsub_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm); ``` @@ -9945,7 +9945,7 @@ Multi-vector dot-product (2-way and 
4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot[_single]_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + void svdot[_single]_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm); @@ -9959,27 +9959,27 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot[_single]_za32[_bf16]_vg1x4(uint64_t slice, + void svdot[_single]_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsudot[_single]_za32[_s8]_vg1x2(uint64_t slice, svint8x2_t zn, + void svsudot[_single]_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn, svuint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsudot[_single]_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, + void svsudot[_single]_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svuint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot[_single]_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, + void svusdot[_single]_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot[_single]_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, + void svusdot[_single]_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8_t zm); ``` @@ -9999,7 +9999,7 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + void svdot_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm); @@ -10013,16 +10013,16 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn, + void svdot_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8x2_t zm); + void svusdot_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8x4_t zm); + void svusdot_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm); ``` @@ -10041,7 +10041,7 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_lane_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + void svdot_lane_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm, uint64_t imm_idx); @@ -10055,27 +10055,27 @@ Multi-vector dot-product (2-way and 4-way) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svdot_lane_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn, + void svdot_lane_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void 
svsudot_lane_za32[_s8]_vg1x2(uint64_t slice, svint8x2_t zn, svuint8_t zm, + void svsudot_lane_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn, svuint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svsudot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svuint8_t zm, + void svsudot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_lane_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8_t zm, + void svusdot_lane_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svusdot_lane_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8_t zm, + void svusdot_lane_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -10087,18 +10087,18 @@ Multi-vector vertical dot-product by indexed element. ``` c __attribute__((arm_streaming, arm_shared_za)) - void svsuvdot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, + void svsuvdot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svusvdot_lane_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, + void svusvdot_lane_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svvdot_lane_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn, + void svvdot_lane_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm, uint64_t imm_idx); @@ -10108,7 +10108,7 @@ Multi-vector vertical dot-product by indexed element. // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svvdot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svint8_t zm, + void svvdot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -10157,7 +10157,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + void svmla[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, svfloat32_t zm); @@ -10165,7 +10165,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + void svmla[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, svfloat32_t zm); @@ -10173,7 +10173,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + void svmls[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, svfloat32_t zm); @@ -10181,7 +10181,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + void svmls[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, svfloat32_t zm); ``` @@ -10195,7 +10195,7 
@@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + void svmla_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, svfloat32x2_t zm); @@ -10203,7 +10203,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + void svmla_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, svfloat32x4_t zm); @@ -10211,7 +10211,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + void svmls_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, svfloat32x2_t zm); @@ -10219,7 +10219,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + void svmls_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, svfloat32x4_t zm); ``` @@ -10233,7 +10233,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + void svmla_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, svfloat32_t zm, uint64_t imm_idx); @@ -10241,7 +10241,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + void svmla_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, svfloat32_t zm, uint64_t imm_idx); @@ -10249,7 +10249,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn, + void svmls_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, svfloat32_t zm, uint64_t imm_idx); @@ -10257,7 +10257,7 @@ Multi-vector floating-point fused multiply-add/subtract // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn, + void svmls_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, svfloat32_t zm, uint64_t imm_idx); ``` @@ -10269,19 +10269,19 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, + void svmla[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + void svmla[_single]_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and 
_za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + void svmla[_single]_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm); ``` @@ -10293,13 +10293,13 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + void svmla_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + void svmla_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm); ``` @@ -10311,19 +10311,19 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, + void svmla_lane_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + void svmla_lane_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + void svmla_lane_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm, uint64_t imm_idx); ``` @@ -10335,19 +10335,19 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, + void svmls[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + void svmls[_single]_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + void svmls[_single]_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm); ``` @@ -10359,13 +10359,13 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + void svmls_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16x2_t zm); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + void svmls_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16x4_t zm); ``` @@ -10377,19 +10377,19 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and 
_za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn, + void svmls_lane_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn, + void svmls_lane_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm, uint64_t imm_idx); // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn, + void svmls_lane_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, svbfloat16_t zm, uint64_t imm_idx); ``` @@ -10405,7 +10405,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, + void svmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm); @@ -10415,7 +10415,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm); @@ -10425,32 +10425,32 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, + void svmla[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svsumla[_single]_za32[_u8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svsumla[_single]_za32[_u8]_vg4x2(uint32_t slice, svint8x2_t zn, svuint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svsumla[_single]_za32[_u8]_vg4x4(uint64_t slice, svint8x4_t zn, + void svsumla[_single]_za32[_u8]_vg4x4(uint32_t slice, svint8x4_t zn, svuint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla[_single]_za32[_s8]_vg4x1(uint64_t slice, svuint8_t zn, + void svusmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svuint8_t zn, svint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svuint8x2_t zn, + void svusmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla[_single]_za32[_s8]_vg4x4(uint64_t slice, svuint8x4_t zn, + void svusmla[_single]_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8_t zm); ``` @@ -10466,7 +10466,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm); // Variants are available for: @@ -10475,15 +10475,15 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if 
__ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8x4_t zm); + void svmla_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm); __attribute__((arm_streaming, arm_shared_za)) - void svusmla_za32[_s8]_vg4x2(uint64_t slice, svuint8x2_t zn, svint8x2_t zm); + void svusmla_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_za32[_s8]_vg4x4(uint64_t slice, svuint8x4_t zn, svint8x4_t zm); + void svusmla_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm); ``` @@ -10498,7 +10498,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm, + void svmla_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm, uint64_t imm_idx); @@ -10508,7 +10508,7 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm, + void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm, uint64_t imm_idx); @@ -10518,37 +10518,37 @@ Multi-vector multiply-add long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmla_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm, + void svmla_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm, uint64_t imm_idx); __attribute__((arm_streaming, arm_shared_za)) - void svsumla_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, + void svsumla_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svuint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svsumla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svsumla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svuint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svsumla_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, + void svsumla_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svuint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_lane_za32[_u8]_vg4x1(uint64_t slice, svuint8_t zn, + void svusmla_lane_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn, svint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_lane_za32[_u8]_vg4x2(uint64_t slice, svuint8x2_t zn, + void svusmla_lane_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8_t zm, uint64_t imm_idx); __attribute_(arm_streaming, arm_shared_za)) - void svusmla_lane_za32[_u8]_vg4x4(uint64_t slice, svuint8x4_t zn, + void svusmla_lane_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -10564,7 +10564,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, + void svmls[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm); @@ -10574,7 +10574,7 @@ Multi-vector multiply-subtract long long 
(widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, + void svmls[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm); @@ -10584,7 +10584,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, + void svmls[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm); ``` @@ -10600,7 +10600,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm); + void svmls_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm); // Variants are available for: @@ -10609,7 +10609,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8x4_t zm); + void svmls_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm); ``` @@ -10624,7 +10624,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm, + void svmls_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm, uint64_t imm_idx); @@ -10634,7 +10634,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm, + void svmls_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm, uint64_t imm_idx); @@ -10644,7 +10644,7 @@ Multi-vector multiply-subtract long long (widening) // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) - void svmls_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm, + void svmls_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm, uint64_t imm_idx); ``` @@ -11285,7 +11285,7 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, - uint64_t slice); + uint32_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], @@ -11293,7 +11293,7 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x4_t svread_hor_za8[_s8]_vg4(uint64_t tile, - uint64_t slice); + uint32_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], @@ -11301,7 +11301,7 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x2_t 
svread_ver_za8[_s8]_vg2(uint64_t tile, - uint64_t slice); + uint32_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], @@ -11309,55 +11309,55 @@ Move multi-vectors to/from ZA // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile, - uint64_t slice); + uint32_t slice); // Variants are also available for _za64_u64 and _za64_f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x2_t svread_za64_s64_vg1x2(uint64_t slice); + svint64x2_t svread_za64_s64_vg1x2(uint32_t slice); // Variants are also available for _za64_u64 and _za64_f64 __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x4_t svread_za64_s64_vg1x4(uint64_t slice); + svint64x4_t svread_za64_s64_vg1x4(uint32_t slice); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint64_t slice, svint8x2_t zn); + void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint64_t slice, svint8x4_t zn); + void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint64_t slice, svint8x2_t zn); + void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn); // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint64_t slice, svint8x4_t zn); + void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn); // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_za64[_s64]_vg1x2(uint64_t slice, svint64x2_t zn); + void svwrite_za64[_s64]_vg1x2(uint32_t slice, svint64x2_t zn); // Variants are also available for _za64[_u64] and _za64[_f64] __attribute__((arm_streaming, arm_shared_za)) - void svwrite_za64[_s64]_vg1x4(uint64_t slice, svint64x4_t zn); + void svwrite_za64[_s64]_vg1x4(uint32_t slice, svint64x4_t zn); ``` From ddf0075a4783772fe137fa813d285e3e3a295e5f Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 6 Oct 2023 08:50:38 +0100 Subject: [PATCH 17/25] Add signed variants of svsub_write_za(32|64) [to squash] --- main/acle.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/main/acle.md b/main/acle.md index 248bcc4c..51667755 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9714,7 +9714,9 @@ the result is written directly into ZA. 
// Variants are available for: + // _za32[_s32] // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) void svsub_write[_single]_za32[_u32]_vg1x2(uint32_t slice, svuint32x2_t zn, @@ -9722,7 +9724,9 @@ the result is written directly into ZA. // Variants are available for: + // _za32[_s32] // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) void svsub_write[_single]_za32[_u32]_vg1x4(uint32_t slice, svuint32x4_t zn, @@ -9759,7 +9763,9 @@ the result is written directly into ZA. // Variants are available for: + // _za32[_s32] // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) void svsub_write_za32[_u32]_vg1x2(uint32_t slice, @@ -9767,7 +9773,9 @@ the result is written directly into ZA. // Variants are available for: + // _za32[_s32] // _za32[_u32] + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) __attribute__((arm_streaming, arm_shared_za)) void svsub_write_za32[_u32]_vg1x4(uint32_t slice, From c26aa72b2f6265f13d489dcabacf63cd2ddf3a3d Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 6 Oct 2023 08:52:27 +0100 Subject: [PATCH 18/25] Combine idx and imm parameters for svpsel_lane [to squash] --- main/acle.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main/acle.md b/main/acle.md index 51667755..1e60be3b 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11406,7 +11406,7 @@ Predicate select between predicate value or all-false ``` c // Variants are also available for _c16, _c32 and _c64 __attribute__((arm_streaming_compatible)) - svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx, uint64_t imm); + svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx); ``` #### CNTP From 3f53a4fa8bb05916af32b33f747b9d7f41047cdb Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 3 Nov 2023 14:55:40 +0000 Subject: [PATCH 19/25] Replace __attribute__((arm_..)) syntax with keywords [to squash] Also removes some unnecessary whitespace. --- main/acle.md | 1957 +++++++++++++++++++++++--------------------------- 1 file changed, 913 insertions(+), 1044 deletions(-) diff --git a/main/acle.md b/main/acle.md index 1e60be3b..0def379a 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9254,8 +9254,8 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // Reads 2 consecutive horizontal tile slices from ZA into multi-vector. - __attributes__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice); + svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; ``` * Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function @@ -9270,9 +9270,9 @@ ZA array vectors. The intrinsics model this in the following way: ``` c // SMLAL intrinsic for 2 quad-vector groups. - __attributes__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, - svint8_t zm, uint64_t imm_idx); + svint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` * Intrinsic functions that take a multi-vector operand may have additional @@ -9284,17 +9284,16 @@ ZA array vectors. 
The intrinsics model this in the following way: elements. ``` c - __attributes__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm); + void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm) + __arm_streaming __arm_shared_za; - - __attributes__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, - svint8_t zm); + svint8_t zm) + __arm_streaming __arm_shared_za; - __attributes__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, - svint8_t zm, uint64_t imm_idx); + svint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` @@ -9698,40 +9697,39 @@ the result is written directly into ZA. // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svadd_write[_single]_za32[_s32]_vg1x2(uint32_t slice, svint32x2_t zn, - svint32_t zm); - - + svint32_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s32] // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svadd_write[_single]_za32[_s32]_vg1x4(uint32_t slice, svint32x4_t zn, - svint32_t zm); - - + svint32_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s32] // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svsub_write[_single]_za32[_u32]_vg1x2(uint32_t slice, svuint32x2_t zn, - svuint32_t zm); - - + svuint32_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s32] // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svsub_write[_single]_za32[_u32]_vg1x4(uint32_t slice, svuint32x4_t zn, - svuint32_t zm); - + svuint32_t zm) + __arm_streaming __arm_shared_za; ``` #### ADD, SUB (store into ZA, multi) @@ -9747,40 +9745,39 @@ the result is written directly into ZA. 
// _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svadd_write_za32[_s32]_vg1x2(uint32_t slice, - svint32x2_t zn, svint32x2_t zm); - - + svint32x2_t zn, svint32x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s32] // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svadd_write_za32[_s32]_vg1x4(uint32_t slice, - svint32x4_t zn, svint32x4_t zm); - - + svint32x4_t zn, svint32x4_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s32] // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svsub_write_za32[_u32]_vg1x2(uint32_t slice, - svuint32x2_t zn, svuint32x2_t zm); - - + svuint32x2_t zn, svuint32x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s32] // _za32[_u32] // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svsub_write_za32[_u32]_vg1x4(uint32_t slice, - svuint32x4_t zn, svuint32x4_t zm); - + svuint32x4_t zn, svuint32x4_t zm) + __arm_streaming __arm_shared_za; ``` #### ADD (vectors) @@ -9791,16 +9788,13 @@ Multi-vector add // Variants are also available for _single_u8_x2, _single_s16_x2, // _single_u16_x2, _single_s32_x2, _single_u32_x2, _single_s64_x2 and // _single_u64_x2 - __attribute__((arm_streaming)) - svint8x2_t svadd[_single_s8_x2](svint8x2_t zdn, svint8_t zm); - - + svint8x2_t svadd[_single_s8_x2](svint8x2_t zdn, svint8_t zm) __arm_streaming; + + // Variants are also available for _single_u8_x4, _single_s16_x4, // _single_u16_x4, _single_s32_x4, _single_u32_x4, _single_s64_x4 and // _single_u64_x4 - __attribute__((arm_streaming)) - svint8x4_t svadd[_single_s8_x4](svint8x4_t zdn, svint8_t zm); - + svint8x4_t svadd[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; ``` #### ADD, SUB, FADD, FSUB (accumulate into ZA) @@ -9815,10 +9809,10 @@ Multi-vector add/sub and accumulate into ZA // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svadd_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm); - - + void svadd_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za32[_s32] @@ -9826,27 +9820,26 @@ Multi-vector add/sub and accumulate into ZA // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svadd_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm); - - + void svadd_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za32[_u32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svsub_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm); - - + void 
svsub_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm)
+    __arm_streaming __arm_shared_za;
+
+
   // Variants are available for:
   // _za32[_f32]
   // _za32[_u32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
-  __attribute__((arm_streaming, arm_shared_za))
-  void svsub_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm);
+  void svsub_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm)
+    __arm_streaming __arm_shared_za;
 ```

 #### BFCVTN, FCVTN

 Multi-vector floating-point convert from single-precision to interleaved half-precision

 ``` c
   // Variants are also available for _f16[_f32_x2]
-  __attribute__((arm_streaming))
-  svbfloat16_t svcvtn_bf16[_f32_x2](svfloat32x2_t zn);
+  svbfloat16_t svcvtn_bf16[_f32_x2](svfloat32x2_t zn) __arm_streaming;
 ```

 #### FCVT, BFCVT, FCVTZS, FCVTZU, SCVTF, UCVTF

 Multi-vector convert to/from floating-point.

 ``` c
   // Variants are also available for _f16[_f32_x2]
-  __attribute__((arm_streaming))
-  svbfloat16_t svcvt_bf16[_f32_x2](svfloat32x2_t zn);
+  svbfloat16_t svcvt_bf16[_f32_x2](svfloat32x2_t zn) __arm_streaming;


   // Variants are also available for _f32[_u32_x2], _s32[_f32_x2] and _u32[_f32_x2]
-  __attribute__((arm_streaming))
-  svfloat32x2_t svcvt_f32[_s32_x2](svint32x2_t zn);
-
-
+  svfloat32x2_t svcvt_f32[_s32_x2](svint32x2_t zn) __arm_streaming;
+
+
   // Variants are also available for _f32[_u32_x4], _s32[_f32_x4] and _u32[_f32_x4]
-  __attribute__((arm_streaming))
-  svfloat32x4_t svcvt_f32[_s32_x4](svint32x4_t zn);
-
+  svfloat32x4_t svcvt_f32[_s32_x4](svint32x4_t zn) __arm_streaming;
 ```

 #### SQCVT, SQCVTU, UQCVT

 Multi-vector saturating extract narrow

 ``` c
   // Variants are also available for _u16[_s32_x2] and _u16[_u32_x2]
-  __attribute__((arm_streaming))
-  svint16_t svqcvt_s16[_s32_x2](svint32x2_t zn);
-
-
+  svint16_t svqcvt_s16[_s32_x2](svint32x2_t zn) __arm_streaming;
+
+
   // Variants are also available for _u8[_s32_x4], _u8[_u32_x4], _s16[_s64_x4],
   // _u16[_s64_x4] and _u16[_u64_x4]
-  __attribute__((arm_streaming))
-  svint8_t svqcvt_s8[_s32_x4](svint32x4_t zn);
-
+  svint8_t svqcvt_s8[_s32_x4](svint32x4_t zn) __arm_streaming;
 ```

 #### SQCVTN, SQCVTUN, UQCVTN

 Multi-vector saturating extract narrow and interleave

 ``` c
   // Variants are also available for _u16[_s32_x2] and _u16[_u32_x2]
-  __attribute__((arm_streaming_compatible))
-  svint16_t svqcvtn_s16[_s32_x2](svint32x2_t zn);
-
-
+  svint16_t svqcvtn_s16[_s32_x2](svint32x2_t zn) __arm_streaming_compatible;
+
+
   // Variants are also available for _u8[_s32_x4], _u8[_u32_x4], _s16[_s64_x4],
   // _u16[_s64_x4] and _u16[_u64_x4]
-  __attribute__((arm_streaming))
-  svint8_t svqcvtn_s8[_s32_x4](svint32x4_t zn);
-
+  svint8_t svqcvtn_s8[_s32_x4](svint32x4_t zn) __arm_streaming;
 ```

 #### UDOT, SDOT, FDOT (vectors)

 Multi-vector dot-product (2-way and 4-way)

 ``` c
   // Variants are also available for _s32_s16_s16 and _u32_u16_u16
-  __attribute__((arm_streaming_compatible))
-  svfloat32_t svdot[_f32_f16_f16](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm);
+  svfloat32_t svdot[_f32_f16_f16](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm)
+    __arm_streaming_compatible;
 ```

 #### UDOT, SDOT, FDOT (indexed)

 Multi-vector dot-product (2-way and 4-way)

 ``` c
   // Variants are also available for _s32 and _u32
-  __attribute__((arm_streaming_compatible))
   svfloat32_t svdot_lane[_f32](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm,
-                               uint64_t imm_idx);
-
+ uint64_t imm_idx) + __arm_streaming_compatible; ``` #### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, single) @@ -9952,11 +9931,11 @@ Multi-vector dot-product (2-way and 4-way) // _za32[_u16] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svdot[_single]_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16_t zm); - - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_bf16] // _za32[_f16] @@ -9966,30 +9945,29 @@ Multi-vector dot-product (2-way and 4-way) // _za32[_u16] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svdot[_single]_za32[_bf16]_vg1x4(uint32_t slice, - svbfloat16x4_t zn, svbfloat16_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) + svbfloat16x4_t zn, svbfloat16_t zm) + __arm_streaming __arm_shared_za; + + void svsudot[_single]_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn, - svuint8_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) + svuint8_t zm) + __arm_streaming __arm_shared_za; + + void svsudot[_single]_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, - svuint8_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) + svuint8_t zm) + __arm_streaming __arm_shared_za; + + void svusdot[_single]_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, - svint8_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) + svint8_t zm) + __arm_streaming __arm_shared_za; + + void svusdot[_single]_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, - svint8_t zm); - + svint8_t zm) + __arm_streaming __arm_shared_za; ``` #### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, multi) @@ -10006,11 +9984,11 @@ Multi-vector dot-product (2-way and 4-way) // _za32[_u16] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svdot_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16x2_t zm); - - + svbfloat16x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_bf16] // _za32[_f16] @@ -10020,18 +9998,17 @@ Multi-vector dot-product (2-way and 4-way) // _za32[_u16] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svdot_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16x4_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) - void svusdot_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm); - + svbfloat16x4_t zm) + __arm_streaming __arm_shared_za; + + + void svusdot_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm) + __arm_streaming __arm_shared_za; + + + void svusdot_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm) + __arm_streaming __arm_shared_za; ``` #### FDOT, BFDOT, SUDOT, USDOT, SDOT, UDOT (store into ZA, indexed) @@ -10048,11 +10025,11 @@ Multi-vector dot-product (2-way and 4-way) // _za32[_u16] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svdot_lane_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, - 
svbfloat16_t zm, uint64_t imm_idx); - - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_bf16] // _za32[_f16] @@ -10062,30 +10039,29 @@ Multi-vector dot-product (2-way and 4-way) // _za32[_u16] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svdot_lane_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - - __attribute__((arm_streaming, arm_shared_za)) + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svsudot_lane_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn, svuint8_t zm, - uint64_t imm_idx); - - - __attribute__((arm_streaming, arm_shared_za)) + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svsudot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svuint8_t zm, - uint64_t imm_idx); - - - __attribute__((arm_streaming, arm_shared_za)) + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svusdot_lane_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8_t zm, - uint64_t imm_idx); - - - __attribute__((arm_streaming, arm_shared_za)) + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svusdot_lane_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8_t zm, - uint64_t imm_idx); - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` #### FVDOT, BFVDOT, SUVDOT, USVDOT, SVDOT, UVDOT @@ -10093,32 +10069,30 @@ Multi-vector dot-product (2-way and 4-way) Multi-vector vertical dot-product by indexed element. ``` c - - __attribute__((arm_streaming, arm_shared_za)) void svsuvdot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, - svuint8_t zm, uint64_t imm_idx); - - - __attribute__((arm_streaming, arm_shared_za)) + svuint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svusvdot_lane_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, - svint8_t zm, uint64_t imm_idx); - - + svint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svvdot_lane_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svvdot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svint8_t zm, - uint64_t imm_idx); - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` #### UMOPA, SMOPA, UMOPS, SMOPS @@ -10127,16 +10101,15 @@ Integer sum of outer products and accumulate/subtract (2-way) ``` c // Variants are also available for _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmopa_za32[_s16]_m(uint64_t tile, svbool_t pn, svbool_t pm, svint16_t zn, - svint16_t zm); - - + svint16_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmops_za32[_s16]_m(uint64_t tile, svbool_t pn, svbool_t pm, svint16_t zn, - svint16_t zm); - + svint16_t zm) + __arm_streaming __arm_shared_za; ``` #### BMOPA, BMOPS @@ -10144,16 +10117,14 @@ Integer sum of outer products and accumulate/subtract (2-way) Bitwise exclusive NOR population count outer product 
and accumulate/subtract ``` c - - __attribute__((arm_streaming, arm_shared_za)) void svbmopa_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, - svuint32_t zn, svuint32_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) + svuint32_t zn, svuint32_t zm) + __arm_streaming __arm_shared_za; + + void svbmops_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, - svuint32_t zn, svuint32_t zm); - + svuint32_t zn, svuint32_t zm) + __arm_streaming __arm_shared_za; ``` #### FMLA, FMLS (single) @@ -10164,34 +10135,33 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, - svfloat32_t zm); - - + svfloat32_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, - svfloat32_t zm); - - + svfloat32_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, - svfloat32_t zm); - - + svfloat32_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, - svfloat32_t zm); - + svfloat32_t zm) + __arm_streaming __arm_shared_za; ``` #### FMLA, FMLS (multi) @@ -10202,34 +10172,33 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, - svfloat32x2_t zm); - - + svfloat32x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, - svfloat32x4_t zm); - - + svfloat32x4_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, - svfloat32x2_t zm); - - + svfloat32x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, - svfloat32x4_t zm); - + svfloat32x4_t zm) + __arm_streaming __arm_shared_za; ``` #### FMLA, FMLS (indexed) @@ -10240,34 +10209,33 @@ Multi-vector floating-point fused multiply-add/subtract // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, - svfloat32_t zm, uint64_t imm_idx); - - + svfloat32_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // 
_za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, - svfloat32_t zm, uint64_t imm_idx); - - + svfloat32_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn, - svfloat32_t zm, uint64_t imm_idx); - - + svfloat32_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_f32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn, - svfloat32_t zm, uint64_t imm_idx); - + svfloat32_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` #### FMLAL, BFMLAL, SMLAL, UMLAL (single) @@ -10276,22 +10244,21 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, - svbfloat16_t zm); - - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16_t zm); - - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16_t zm); - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; ``` #### FMLAL, BFMLAL, SMLAL, UMLAL (multi) @@ -10300,16 +10267,15 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16x2_t zm); - - + svbfloat16x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16x4_t zm); - + svbfloat16x4_t zm) + __arm_streaming __arm_shared_za; ``` #### FMLAL, BFMLAL, SMLAL, UMLAL (indexed) @@ -10318,22 +10284,21 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16_t zm, uint64_t imm_idx); - + svbfloat16_t zm, uint64_t 
imm_idx) + __arm_streaming __arm_shared_za; ``` #### BFMLSL, FMLSL, UMLSL, SMLSL (single) @@ -10342,22 +10307,21 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, - svbfloat16_t zm); - - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16_t zm); - - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16_t zm); - + svbfloat16_t zm) + __arm_streaming __arm_shared_za; ``` #### BFMLSL, FMLSL, UMLSL, SMLSL (multi) @@ -10366,16 +10330,15 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16x2_t zm); - - + svbfloat16x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16x4_t zm); - + svbfloat16x4_t zm) + __arm_streaming __arm_shared_za; ``` #### BFMLSL, FMLSL, UMLSL, SMLSL (indexed) @@ -10384,22 +10347,21 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn, - svbfloat16_t zm, uint64_t imm_idx); - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` #### UMLALL, SMLALL, USMLALL, SUMLALL (single) @@ -10412,55 +10374,54 @@ Multi-vector multiply-add long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, - svint8_t zm); - - + svint8_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, - svint8_t zm); - - + svint8_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] 
// _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, - svint8_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) + svint8_t zm) + __arm_streaming __arm_shared_za; + + void svsumla[_single]_za32[_u8]_vg4x2(uint32_t slice, svint8x2_t zn, - svuint8_t zm); - - - __attribute_(arm_streaming, arm_shared_za)) + svuint8_t zm) + __arm_streaming __arm_shared_za; + + void svsumla[_single]_za32[_u8]_vg4x4(uint32_t slice, svint8x4_t zn, - svuint8_t zm); - - - __attribute_(arm_streaming, arm_shared_za)) + svuint8_t zm) + __arm_streaming __arm_shared_za; + + void svusmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svuint8_t zn, - svint8_t zm); - - - __attribute_(arm_streaming, arm_shared_za)) + svint8_t zm) + __arm_streaming __arm_shared_za; + + void svusmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, - svint8_t zm); - - - __attribute_(arm_streaming, arm_shared_za)) + svint8_t zm) + __arm_streaming __arm_shared_za; + + void svusmla[_single]_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, - svint8_t zm); - + svint8_t zm) + __arm_streaming __arm_shared_za; ``` #### UMLALL, SMLALL, USMLALL, SUMLALL (multi) @@ -10473,26 +10434,25 @@ Multi-vector multiply-add long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm); - - + void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svmla_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm); - - - __attribute__((arm_streaming, arm_shared_za)) - void svusmla_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm); - - - __attribute_(arm_streaming, arm_shared_za)) - void svusmla_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm); - + void svmla_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm) + __arm_streaming __arm_shared_za; + + + void svusmla_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm) + __arm_streaming __arm_shared_za; + + + void svusmla_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm) + __arm_streaming __arm_shared_za; ``` #### UMLALL, SMLALL, USMLALL, SUMLALL (indexed) @@ -10505,60 +10465,59 @@ Multi-vector multiply-add long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] 
(only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmla_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm, - uint64_t imm_idx); - - - __attribute__((arm_streaming, arm_shared_za)) + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svsumla_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, - svuint8_t zm, uint64_t imm_idx); - - - __attribute_(arm_streaming, arm_shared_za)) + svuint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svsumla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, - svuint8_t zm, uint64_t imm_idx); - - - __attribute_(arm_streaming, arm_shared_za)) + svuint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svsumla_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, - svuint8_t zm, uint64_t imm_idx); - - - __attribute_(arm_streaming, arm_shared_za)) + svuint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svusmla_lane_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn, - svint8_t zm, uint64_t imm_idx); - - - __attribute_(arm_streaming, arm_shared_za)) + svint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svusmla_lane_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn, - svint8_t zm, uint64_t imm_idx); - - - __attribute_(arm_streaming, arm_shared_za)) + svint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + void svusmla_lane_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn, - svint8_t zm, uint64_t imm_idx); - + svint8_t zm, uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` #### SMLSLL, UMLSLL (single) @@ -10571,30 +10530,29 @@ Multi-vector multiply-subtract long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, - svint8_t zm); - - + svint8_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, - svint8_t zm); - - + svint8_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, - svint8_t zm); - + svint8_t zm) + __arm_streaming __arm_shared_za; ``` #### SMLSLL, UMLSLL (multi) @@ -10607,18 +10565,17 @@ Multi-vector multiply-subtract long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm); - - + void svmls_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) - void svmls_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm); - + void 
svmls_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm) + __arm_streaming __arm_shared_za; ``` #### SMLSLL, UMLSLL (indexed) @@ -10631,30 +10588,29 @@ Multi-vector multiply-subtract long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; + + // Variants are available for: // _za32[_s8] // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - __attribute__((arm_streaming, arm_shared_za)) void svmls_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm, - uint64_t imm_idx); - + uint64_t imm_idx) + __arm_streaming __arm_shared_za; ``` #### BFMLSLB, BFMLSLT @@ -10662,24 +10618,22 @@ Multi-vector multiply-subtract long long (widening) BFloat16 floating-point multiply-subtract long from single-precision (top/bottom) ``` c - - __attribute__((arm_streaming_compatible)) - svfloat32_t svbfmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); - - - __attribute__((arm_streaming_compatible)) + svfloat32_t svbfmlslb[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) + __arm_streaming_compatible; + + svfloat32_t svbfmlslb_lane[_f32](svfloat32_t zda, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); - - - __attribute__((arm_streaming_compatible)) - svfloat32_t svbfmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm); - - - __attribute__((arm_streaming_compatible)) + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming_compatible; + + + svfloat32_t svbfmlslt[_f32](svfloat32_t zda, svbfloat16_t zn, svbfloat16_t zm) + __arm_streaming_compatible; + + svfloat32_t svbfmlslt_lane[_f32](svfloat32_t zda, svbfloat16_t zn, - svbfloat16_t zm, uint64_t imm_idx); - + svbfloat16_t zm, uint64_t imm_idx) + __arm_streaming_compatible; ``` #### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (single) @@ -10690,30 +10644,29 @@ Multi-vector min/max // Variants are also available for _single_s8_x2, _single_u8_x2, // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svmax[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - - + svfloat16x2_t svmax[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_s8_x4, _single_u8_x4, // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svmax[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); - - + svfloat16x4_t svmax[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_s8_x2, _single_u8_x2, // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 - 
__attribute__((arm_streaming)) - svfloat16x2_t svmin[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - - + svfloat16x2_t svmin[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_s8_x4, _single_u8_x4, // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svmin[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); - + svfloat16x4_t svmin[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) + __arm_streaming; ``` #### SMAX, SMIN, UMAX, UMIN, FMAX, FMIN (multi) @@ -10723,27 +10676,26 @@ Multi-vector min/max ``` c // Variants are also available for _s8_x2, _u8_x2, _s16_x2, _u16_x2, // _s32_x2, _u32_x2, _f32_x2, _s64_x2, _u64_x2 and _f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svmax[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - - + svfloat16x2_t svmax[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) + __arm_streaming; + + // Variants are also available for _s8_x4, _u8_x4, _s16_x4, _u16_x4, // _s32_x4, _u32_x4, _f32_x4, _s64_x4, _u64_x4 and _f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svmax[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); - - + svfloat16x4_t svmax[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) + __arm_streaming; + + // Variants are also available for _s8_x2, _u8_x2, _s16_x2, _u16_x2, // _s32_x2, _u32_x2, _f32_x2, _s64_x2, _u64_x2 and _f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svmin[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - - + svfloat16x2_t svmin[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) + __arm_streaming; + + // Variants are also available for _s8_x4, _u8_x4, _s16_x4, _u16_x4, // _s32_x4, _u32_x4, _f32_x4, _s64_x4,_u64_x4 and _f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svmin[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); - + svfloat16x4_t svmin[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) + __arm_streaming; ``` #### FMAXNM, FMINNM (single) @@ -10752,24 +10704,23 @@ Multi-vector floating point min/max number ``` c // Variants are also available for _single_f32_x2 and _single_f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svmaxnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - - + svfloat16x2_t svmaxnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_f32_x4 and _single_f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svmaxnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); - - + svfloat16x4_t svmaxnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_f32_x2 and _single_f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svminnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm); - - + svfloat16x2_t svminnm[_single_f16_x2](svfloat16x2_t zdn, svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_f32_x4 and _single_f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svminnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm); - + svfloat16x4_t svminnm[_single_f16_x4](svfloat16x4_t zdn, svfloat16_t zm) + __arm_streaming; ``` #### FMAXNM, FMINNM (multi) @@ -10778,24 +10729,23 @@ Multi-vector floating point min/max number ``` c // Variants are also available for _f32_x2 and _f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svmaxnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - - + svfloat16x2_t svmaxnm[_f16_x2](svfloat16x2_t zdn, 
svfloat16x2_t zm) + __arm_streaming; + + // Variants are also available for _f32_x4 and _f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svmaxnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); - - + svfloat16x4_t svmaxnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) + __arm_streaming; + + // Variants are also available for _f32_x2 and _f64_x2 - __attribute__((arm_streaming)) - svfloat16x2_t svminnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm); - - + svfloat16x2_t svminnm[_f16_x2](svfloat16x2_t zdn, svfloat16x2_t zm) + __arm_streaming; + + // Variants are also available for _f32_x4 and _f64_x4 - __attribute__((arm_streaming)) - svfloat16x4_t svminnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm); - + svfloat16x4_t svminnm[_f16_x4](svfloat16x4_t zdn, svfloat16x4_t zm) + __arm_streaming; ``` #### FRINTA, FRINTM, FRINTN, FRINTP @@ -10803,38 +10753,28 @@ Multi-vector floating point min/max number Multi-vector floating-point round to integral value ``` c - - __attribute__((arm_streaming)) - svfloat32x2_t svrinta[_f32_x2](svfloat32x2_t zn); - - - __attribute__((arm_streaming)) - svfloat32x4_t svrinta[_f32_x4](svfloat32x4_t zn); - - - __attribute__((arm_streaming)) - svfloat32x2_t svrintm[_f32_x2](svfloat32x2_t zn); - - - __attribute__((arm_streaming)) - svfloat32x4_t svrintm[_f32_x4](svfloat32x4_t zn); - - - __attribute__((arm_streaming)) - svfloat32x2_t svrintn[_f32_x2](svfloat32x2_t zn); - - - __attribute__((arm_streaming)) - svfloat32x4_t svrintn[_f32_x4](svfloat32x4_t zn); - - - __attribute__((arm_streaming)) - svfloat32x2_t svrintp[_f32_x2](svfloat32x2_t zn); - - - __attribute__((arm_streaming)) - svfloat32x4_t svrintp[_f32_x4](svfloat32x4_t zn); - + svfloat32x2_t svrinta[_f32_x2](svfloat32x2_t zn) __arm_streaming; + + + svfloat32x4_t svrinta[_f32_x4](svfloat32x4_t zn) __arm_streaming; + + + svfloat32x2_t svrintm[_f32_x2](svfloat32x2_t zn) __arm_streaming; + + + svfloat32x4_t svrintm[_f32_x4](svfloat32x4_t zn) __arm_streaming; + + + svfloat32x2_t svrintn[_f32_x2](svfloat32x2_t zn) __arm_streaming; + + + svfloat32x4_t svrintn[_f32_x4](svfloat32x4_t zn) __arm_streaming; + + + svfloat32x2_t svrintp[_f32_x2](svfloat32x2_t zn) __arm_streaming; + + + svfloat32x4_t svrintp[_f32_x4](svfloat32x4_t zn) __arm_streaming; ``` #### LD1B, LD1D, LD1H, LD1W @@ -10842,94 +10782,92 @@ Multi-vector floating-point round to integral value Contiguous load to multi-vector ``` c - // Variants are also available for _s8 - __attribute__((arm_streaming)) - svuint8x2_t svld1[_u8]_x2(svcount_t png, const uint8_t *rn); - - + svuint8x2_t svld1[_u8]_x2(svcount_t png, const uint8_t *rn) + __arm_streaming; + + // Variants are also available for _s8 - __attribute__((arm_streaming)) - svuint8x4_t svld1[_u8]_x4(svcount_t png, const uint8_t *rn); - - + svuint8x4_t svld1[_u8]_x4(svcount_t png, const uint8_t *rn) + __arm_streaming; + + // Variants are also available for _s8 - __attribute__((arm_streaming)) svuint8x2_t svld1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s8 - __attribute__((arm_streaming)) svuint8x4_t svld1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) - svuint16x2_t svld1[_u16]_x2(svcount_t png, const uint16_t *rn); - - + svuint16x2_t svld1[_u16]_x2(svcount_t png, const uint16_t *rn) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - 
__attribute__((arm_streaming)) - svuint16x4_t svld1[_u16]_x4(svcount_t png, const uint16_t *rn); - - + svuint16x4_t svld1[_u16]_x4(svcount_t png, const uint16_t *rn) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) svuint16x2_t svld1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) svuint16x4_t svld1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) - svuint32x2_t svld1[_u32]_x2(svcount_t png, const uint32_t *rn); - - + svuint32x2_t svld1[_u32]_x2(svcount_t png, const uint32_t *rn) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) - svuint32x4_t svld1[_u32]_x4(svcount_t png, const uint32_t *rn); - - + svuint32x4_t svld1[_u32]_x4(svcount_t png, const uint32_t *rn) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) svuint32x2_t svld1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) svuint32x4_t svld1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) - svuint64x2_t svld1[_u64]_x2(svcount_t png, const uint64_t *rn); - - + svuint64x2_t svld1[_u64]_x2(svcount_t png, const uint64_t *rn) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) - svuint64x4_t svld1[_u64]_x4(svcount_t png, const uint64_t *rn); - - + svuint64x4_t svld1[_u64]_x4(svcount_t png, const uint64_t *rn) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) svuint64x2_t svld1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) svuint64x4_t svld1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, - int64_t vnum); - + int64_t vnum) + __arm_streaming; ``` #### LDNT1B, LDNT1D, LDNT1H, LDNT1W @@ -10937,94 +10875,92 @@ Contiguous load to multi-vector Contiguous non-temporal load to multi-vector ``` c - // Variants are also available for _s8 - __attribute__((arm_streaming)) - svuint8x2_t svldnt1[_u8]_x2(svcount_t png, const uint8_t *rn); - - + svuint8x2_t svldnt1[_u8]_x2(svcount_t png, const uint8_t *rn) + __arm_streaming; + + // Variants are also available for _s8 - __attribute__((arm_streaming)) - svuint8x4_t svldnt1[_u8]_x4(svcount_t png, const uint8_t *rn); - - + svuint8x4_t svldnt1[_u8]_x4(svcount_t png, const uint8_t *rn) + __arm_streaming; + + // Variants are also available for _s8 - __attribute__((arm_streaming)) svuint8x2_t svldnt1_vnum[_u8]_x2(svcount_t png, const uint8_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s8 - __attribute__((arm_streaming)) svuint8x4_t svldnt1_vnum[_u8]_x4(svcount_t png, const uint8_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) - svuint16x2_t svldnt1[_u16]_x2(svcount_t 
png, const uint16_t *rn); - - + svuint16x2_t svldnt1[_u16]_x2(svcount_t png, const uint16_t *rn) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) - svuint16x4_t svldnt1[_u16]_x4(svcount_t png, const uint16_t *rn); - - + svuint16x4_t svldnt1[_u16]_x4(svcount_t png, const uint16_t *rn) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) svuint16x2_t svldnt1_vnum[_u16]_x2(svcount_t png, const uint16_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s16, _f16 and _bf16 - __attribute__((arm_streaming)) svuint16x4_t svldnt1_vnum[_u16]_x4(svcount_t png, const uint16_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) - svuint32x2_t svldnt1[_u32]_x2(svcount_t png, const uint32_t *rn); - - + svuint32x2_t svldnt1[_u32]_x2(svcount_t png, const uint32_t *rn) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) - svuint32x4_t svldnt1[_u32]_x4(svcount_t png, const uint32_t *rn); - - + svuint32x4_t svldnt1[_u32]_x4(svcount_t png, const uint32_t *rn) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) svuint32x2_t svldnt1_vnum[_u32]_x2(svcount_t png, const uint32_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s32 and _f32 - __attribute__((arm_streaming)) svuint32x4_t svldnt1_vnum[_u32]_x4(svcount_t png, const uint32_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) - svuint64x2_t svldnt1[_u64]_x2(svcount_t png, const uint64_t *rn); - - + svuint64x2_t svldnt1[_u64]_x2(svcount_t png, const uint64_t *rn) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) - svuint64x4_t svldnt1[_u64]_x4(svcount_t png, const uint64_t *rn); - - + svuint64x4_t svldnt1[_u64]_x4(svcount_t png, const uint64_t *rn) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) svuint64x2_t svldnt1_vnum[_u64]_x2(svcount_t png, const uint64_t *rn, - int64_t vnum); - - + int64_t vnum) + __arm_streaming; + + // Variants are also available for _s64 and _f64 - __attribute__((arm_streaming)) svuint64x4_t svldnt1_vnum[_u64]_x4(svcount_t png, const uint64_t *rn, - int64_t vnum); - + int64_t vnum) + __arm_streaming; ``` #### ST1B, ST1D, ST1H, ST1W @@ -11032,94 +10968,92 @@ Contiguous non-temporal load to multi-vector Contiguous store of multi-vector operand ``` c - // Variants are also available for _s8_x2 - __attribute__((arm_streaming)) - void svst1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); - - + void svst1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt) + __arm_streaming; + + // Variants are also available for _s8_x4 - __attribute__((arm_streaming)) - void svst1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); - - + void svst1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt) + __arm_streaming; + + // Variants are also available for _s8_x2 - __attribute__((arm_streaming)) void svst1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x2_t zt); - - + svuint8x2_t zt) + __arm_streaming; + + // Variants are also available for _s8_x4 - __attribute__((arm_streaming)) void 
svst1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x4_t zt); - - + svuint8x4_t zt) + __arm_streaming; + + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - __attribute__((arm_streaming)) - void svst1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); - - + void svst1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt) + __arm_streaming; + + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - __attribute__((arm_streaming)) - void svst1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); - - + void svst1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt) + __arm_streaming; + + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - __attribute__((arm_streaming)) void svst1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x2_t zt); - + svuint16x2_t zt) + __arm_streaming; + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - __attribute__((arm_streaming)) void svst1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x4_t zt); - - + svuint16x4_t zt) + __arm_streaming; + + // Variants are also available for _s32_x2 and _f32_x2 - __attribute__((arm_streaming)) - void svst1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); - - + void svst1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt) + __arm_streaming; + + // Variants are also available for _s32_x4 and _f32_x4 - __attribute__((arm_streaming)) - void svst1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); - - + void svst1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt) + __arm_streaming; + + // Variants are also available for _s32_x2 and _f32_x2 - __attribute__((arm_streaming)) void svst1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x2_t zt); - - + svuint32x2_t zt) + __arm_streaming; + + // Variants are also available for _s32_x4 and _f32_x4 - __attribute__((arm_streaming)) void svst1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x4_t zt); - - + svuint32x4_t zt) + __arm_streaming; + + // Variants are also available for _s64_x2 and _f64_x2 - __attribute__((arm_streaming)) - void svst1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); - - + void svst1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt) + __arm_streaming; + + // Variants are also available for _s64_x4 and _f64_x4 - __attribute__((arm_streaming)) - void svst1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); - - + void svst1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt) + __arm_streaming; + + // Variants are also available for _s64_x2 and _f64_x2 - __attribute__((arm_streaming)) void svst1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x2_t zt); - - + svuint64x2_t zt) + __arm_streaming; + + // Variants are also available for _s64_x4 and _f64_x4 - __attribute__((arm_streaming)) void svst1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x4_t zt); - + svuint64x4_t zt) + __arm_streaming; ``` #### STNT1B, STNT1D, STNT1H, STNT1W @@ -11127,94 +11061,92 @@ Contiguous store of multi-vector operand Contiguous non-temporal store of multi-vector operand ``` c - // Variants are also available for _s8_x2 - __attribute__((arm_streaming)) - void svstnt1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt); - - + void svstnt1[_u8_x2](svcount_t png, uint8_t *rn, svuint8x2_t zt) + __arm_streaming; + + // Variants are also available for _s8_x4 - __attribute__((arm_streaming)) - void svstnt1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt); - - + 
void svstnt1[_u8_x4](svcount_t png, uint8_t *rn, svuint8x4_t zt) + __arm_streaming; + + // Variants are also available for _s8_x2 - __attribute__((arm_streaming)) void svstnt1_vnum[_u8_x2](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x2_t zt); - - + svuint8x2_t zt) + __arm_streaming; + + // Variants are also available for _s8_x4 - __attribute__((arm_streaming)) void svstnt1_vnum[_u8_x4](svcount_t png, uint8_t *rn, int64_t vnum, - svuint8x4_t zt); - - + svuint8x4_t zt) + __arm_streaming; + + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - __attribute__((arm_streaming)) - void svstnt1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt); - - + void svstnt1[_u16_x2](svcount_t png, uint16_t *rn, svuint16x2_t zt) + __arm_streaming; + + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - __attribute__((arm_streaming)) - void svstnt1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt); - - + void svstnt1[_u16_x4](svcount_t png, uint16_t *rn, svuint16x4_t zt) + __arm_streaming; + + // Variants are also available for _s16_x2, _f16_x2 and _bf16_x2 - __attribute__((arm_streaming)) void svstnt1_vnum[_u16_x2](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x2_t zt); - - + svuint16x2_t zt) + __arm_streaming; + + // Variants are also available for _s16_x4, _f16_x4 and _bf16_x4 - __attribute__((arm_streaming)) void svstnt1_vnum[_u16_x4](svcount_t png, uint16_t *rn, int64_t vnum, - svuint16x4_t zt); - - + svuint16x4_t zt) + __arm_streaming; + + // Variants are also available for _s32_x2 and _f32_x2 - __attribute__((arm_streaming)) - void svstnt1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt); - - + void svstnt1[_u32_x2](svcount_t png, uint32_t *rn, svuint32x2_t zt) + __arm_streaming; + + // Variants are also available for _s32_x4 and _f32_x4 - __attribute__((arm_streaming)) - void svstnt1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt); - - + void svstnt1[_u32_x4](svcount_t png, uint32_t *rn, svuint32x4_t zt) + __arm_streaming; + + // Variants are also available for _s32_x2 and _f32_x2 - __attribute__((arm_streaming)) void svstnt1_vnum[_u32_x2](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x2_t zt); - - + svuint32x2_t zt) + __arm_streaming; + + // Variants are also available for _s32_x4 and _f32_x4 - __attribute__((arm_streaming)) void svstnt1_vnum[_u32_x4](svcount_t png, uint32_t *rn, int64_t vnum, - svuint32x4_t zt); - - + svuint32x4_t zt) + __arm_streaming; + + // Variants are also available for _s64_x2 and _f64_x2 - __attribute__((arm_streaming)) - void svstnt1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt); - - + void svstnt1[_u64_x2](svcount_t png, uint64_t *rn, svuint64x2_t zt) + __arm_streaming; + + // Variants are also available for _s64_x4 and _f64_x4 - __attribute__((arm_streaming)) - void svstnt1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt); - - + void svstnt1[_u64_x4](svcount_t png, uint64_t *rn, svuint64x4_t zt) + __arm_streaming; + + // Variants are also available for _s64_x2 and _f64_x2 - __attribute__((arm_streaming)) - void svstnt1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x2_t zt); - - + void svstnt1_vnum[_u64_x2](svcount_t png, uint64_t *rn, int64_t vnum, + svuint64x2_t zt) + __arm_streaming; + + // Variants are also available for _s64_x4 and _f64_x4 - __attribute__((arm_streaming)) void svstnt1_vnum[_u64_x4](svcount_t png, uint64_t *rn, int64_t vnum, - svuint64x4_t zt); - + svuint64x4_t zt) + __arm_streaming; ``` #### LDR, STR @@ -11222,14 +11154,12 @@ Contiguous 
non-temporal store of multi-vector operand Spill and fill of ZT0 ``` c - - __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svldr_zt(uint64_t zt, const void *rn); - - - __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svstr_zt(uint64_t zt, void *rn); - + void svldr_zt(uint64_t zt, const void *rn) + __arm_streaming_compatible __arm_shared_za __arm_preserves_za; + + + void svstr_zt(uint64_t zt, void *rn) + __arm_streaming_compatible __arm_shared_za __arm_preserves_za; ``` #### ZERO @@ -11237,10 +11167,8 @@ Spill and fill of ZT0 Zero ZT0 ``` c - - __attribute__((arm_streaming_compatible, arm_shared_za, arm_preserves_za)) - void svzero_zt(uint64_t zt); - + void svzero_zt(uint64_t zt) + __arm_streaming_compatible __arm_shared_za __arm_preserves_za; ``` #### LUTI2, LUTI4 @@ -11249,38 +11177,37 @@ Lookup table read with 2-bit and 4-bit indexes ``` c // Variants are also available for _zt[_u16] and _zt[_u32] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svuint8_t svluti2_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx); - - + svuint8_t svluti2_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _zt[_u16] and _zt[_u32] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svuint8x2_t svluti2_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _zt[_u16] and _zt[_u32] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svuint8x4_t svluti2_lane_zt[_u8]_x4(uint64_t zt, svuint8_t zn, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _zt[_u16] and _zt[_u32] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svuint8_t svluti4_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx); - - + svuint8_t svluti4_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _zt[_u16] and _zt[_u32] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svuint8x2_t svluti4_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn, - uint64_t imm_idx); - - + uint64_t imm_idx) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _zt[_u32] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) svuint16x4_t svluti4_lane_zt[_u16]_x4(uint64_t zt, svuint16_t zn, - uint64_t imm_idx); - + uint64_t imm_idx) + __arm_streaming __arm_shared_za __arm_preserves_za; ``` #### MOVA @@ -11291,82 +11218,77 @@ Move multi-vectors to/from ZA // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, - uint32_t slice); - - + svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x4_t 
svread_hor_za8[_s8]_vg4(uint64_t tile, - uint32_t slice); - - + svint8x4_t svread_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x2_t svread_ver_za8[_s8]_vg2(uint64_t tile, - uint32_t slice); - - + svint8x2_t svread_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile, - uint32_t slice); - - + svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _za64_u64 and _za64_f64 - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x2_t svread_za64_s64_vg1x2(uint32_t slice); - - + svint64x2_t svread_za64_s64_vg1x2(uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _za64_u64 and _za64_f64 - __attribute__((arm_streaming, arm_shared_za, arm_preserves_za)) - svint64x4_t svread_za64_s64_vg1x4(uint32_t slice); - - + svint64x4_t svread_za64_s64_vg1x4(uint32_t slice) + __arm_streaming __arm_shared_za __arm_preserves_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn); - - + void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za)) - void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn); - - + void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn); - - + void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16], // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32], // _za64[_s64], _za64[_u64] and _za64_[f64] - __attribute__((arm_streaming, arm_shared_za)) - void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn); - - + void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za64[_u64] and _za64[_f64] - __attribute__((arm_streaming, arm_shared_za)) - void 
svwrite_za64[_s64]_vg1x2(uint32_t slice, svint64x2_t zn); - - + void svwrite_za64[_s64]_vg1x2(uint32_t slice, svint64x2_t zn) + __arm_streaming __arm_shared_za; + + // Variants are also available for _za64[_u64] and _za64[_f64] - __attribute__((arm_streaming, arm_shared_za)) - void svwrite_za64[_s64]_vg1x4(uint32_t slice, svint64x4_t zn); - + void svwrite_za64[_s64]_vg1x4(uint32_t slice, svint64x4_t zn) + __arm_streaming __arm_shared_za; ``` #### PTRUE @@ -11375,11 +11297,10 @@ Initialise predicate-as-counter to all active or all inactive. ``` c // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svptrue_c8(); - - __attribute__((arm_streaming_compatible)) - svcount_t svpfalse_c(void); + svcount_t svptrue_c8() __arm_streaming; + + + svcount_t svpfalse_c(void) __arm_streaming_compatible; ``` @@ -11389,14 +11310,11 @@ Transform a predicate-as-counter to a predicate (pair). ``` c // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svbool_t svpext_lane_c8(svcount_t pnn, uint64_t imm); - - + svbool_t svpext_lane_c8(svcount_t pnn, uint64_t imm) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svboolx2_t svpext_lane_c8_x2(svcount_t pnn, uint64_t imm); - + svboolx2_t svpext_lane_c8_x2(svcount_t pnn, uint64_t imm) __arm_streaming; ``` #### PSEL @@ -11405,8 +11323,8 @@ Predicate select between predicate value or all-false ``` c // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming_compatible)) - svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx); + svcount_t svpsel_lane_c8(svcount_t pn, svbool_t pm, uint32_t idx) + __arm_streaming_compatible; ``` #### CNTP @@ -11415,9 +11333,7 @@ Set scalar to count from predicate-as-counter. ``vl`` is expected to be 2 or 4. 
``` c // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - uint64_t svcntp_c8(svcount_t pnn, uint64_t vl); - + uint64_t svcntp_c8(svcount_t pnn, uint64_t vl) __arm_streaming; ``` #### UCLAMP, SCLAMP, FCLAMP @@ -11427,25 +11343,24 @@ Multi-vector clamp to minimum/maximum vector ``` c // Variants are also available for _s8, _u8, _s16, _u16, _s32, _u32, _f32, // _s64, _u64 and _f64 - __attribute__((arm_streaming_compatible)) - svfloat16_t svclamp[_f16](svfloat16_t zd, svfloat16_t zn, svfloat16_t zm); - - + svfloat16_t svclamp[_f16](svfloat16_t zd, svfloat16_t zn, svfloat16_t zm) + __arm_streaming_compatible; + + // Variants are also available for _single_s8_x2, _single_u8_x2, // _single_s16_x2, _single_u16_x2, _single_s32_x2, _single_u32_x2, // _single_f32_x2, _single_s64_x2, _single_u64_x2 and _single_f64_x2 - __attribute__((arm_streaming)) svfloat16x2_t svclamp[_single_f16_x2](svfloat16x2_t zd, svfloat16_t zn, - svfloat16_t zm); - - + svfloat16_t zm) + __arm_streaming; + + // Variants are also available for _single_s8_x4, _single_u8_x4, // _single_s16_x4, _single_u16_x4, _single_s32_x4, _single_u32_x4, // _single_f32_x4, _single_s64_x4, _single_u64_x4 and _single_f64_x4 - __attribute__((arm_streaming)) svfloat16x4_t svclamp[_single_f16_x4](svfloat16x4_t zd, svfloat16_t zn, - svfloat16_t zm); - + svfloat16_t zm) + __arm_streaming; ``` @@ -11456,15 +11371,14 @@ Multi-vector conditionally select elements from two vectors ``` c // Variants are also available for _s8_x2, _u16_x2, _s16_x2, _f16_x2, // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 - __attribute__((arm_streaming)) - svuint8x2_t svsel[_u8_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm); - - + svuint8x2_t svsel[_u8_x2](svcount_t png, svuint8x2_t zn, svuint8x2_t zm) + __arm_streaming; + + // Variants are also available for _s8_x4, _u16_x4, _s16_x4, _f16_x4, // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 - __attribute__((arm_streaming)) - svuint8x4_t svsel[_u8_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm); - + svuint8x4_t svsel[_u8_x4](svcount_t png, svuint8x4_t zn, svuint8x4_t zm) + __arm_streaming; ``` #### URSHL, SRSHL (single) @@ -11475,16 +11389,13 @@ Multi-vector rounding shift left // Variants are also available for _single_u8_x2, _single_u16_x2, // _single_s16_x2, _single_u32_x2, _single_s32_x2, _single_u64_x2 // and _single_s64_x2 - __attribute__((arm_streaming)) - svint8x2_t svrshl[_single_s8_x2](svint8x2_t zdn, svint8_t zm); - - + svint8x2_t svrshl[_single_s8_x2](svint8x2_t zdn, svint8_t zm) __arm_streaming; + + // Variants are also available for _single_u8_x4, _single_u16_x4, // _single_s16_x4, _single_u32_x4, _single_s32_x4, _single_u64_x4 // and _single_s64_x4 - __attribute__((arm_streaming)) - svint8x4_t svrshl[_single_s8_x4](svint8x4_t zdn, svint8_t zm); - + svint8x4_t svrshl[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; ``` #### URSHL, SRSHL (multi) @@ -11494,15 +11405,12 @@ Multi-vector rounding shift left ``` c // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _u32_x2, _s32_x2, // _u64_x2 and _s64_x2 - __attribute__((arm_streaming)) - svint8x2_t svrshl[_s8_x2](svint8x2_t zdn, svint8x2_t zm); - - + svint8x2_t svrshl[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; + + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _u32_x4, _s32_x4, // _u64_x4 and _s64_x4 - __attribute__((arm_streaming)) - svint8x4_t svrshl[_s8_x4](svint8x4_t zdn, svint8x4_t zm); - + svint8x4_t svrshl[_s8_x4](svint8x4_t 
zdn, svint8x4_t zm) __arm_streaming; ``` #### SQRSHR, UQRSHR @@ -11511,20 +11419,15 @@ Multi-vector saturating rounding shift right narrow ``` c // Variants are also available for _u16_x2 - __attribute__((arm_streaming)) - svint16_t svqrshr[_s16_x2](svint32x2_t zn, uint64_t imm); - + svint16_t svqrshr[_s16_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; + // Variants are also available for _u8_x4 - __attribute__((arm_streaming)) - svint8_t svqrshr[_s8_x4](svint32x4_t zn, uint64_t imm); - + svint8_t svqrshr[_s8_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; + // Variants are also available for _u16_x4 - __attribute__((arm_streaming)) - svint16_t svqrshr[_s16_x4](svint64x4_t zn, uint64_t imm); - - + svint16_t svqrshr[_s16_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; ``` #### SQRSHRN, UQRSHRN @@ -11533,19 +11436,18 @@ Multi-vector saturating rounding shift right narrow and interleave ``` c // Variants are also available for _u16_x2 - __attribute__((arm_streaming_compatible)) - svint16_t svqrshrn[_s16_x2](svint32x2_t zn, uint64_t imm); - - + svint16_t svqrshrn[_s16_x2](svint32x2_t zn, uint64_t imm) + __arm_streaming; + + // Variants are also available for _u8_x4 - __attribute__((arm_streaming)) - svint8_t svqrshrn[_s8_x4](svint32x4_t zn, uint64_t imm); + svint8_t svqrshrn[_s8_x4](svint32x4_t zn, uint64_t imm) + __arm_streaming_compatible; // Variants are also available for _u16_x4 - __attribute__((arm_streaming)) - svint16_t svqrshrn[_s16_x4](svint64x4_t zn, uint64_t imm); - + svint16_t svqrshrn[_s16_x4](svint64x4_t zn, uint64_t imm) + __arm_streaming; ``` #### SQRSHRU @@ -11553,18 +11455,13 @@ Multi-vector saturating rounding shift right narrow and interleave Multi-vector saturating rounding shift right unsigned narrow ``` c - - __attribute__((arm_streaming)) - svuint16_t svsqrshru[_u16_x2](svint32x2_t zn, uint64_t imm); - - - __attribute__((arm_streaming)) - svuint8_t svsqrshru[_u8_x4](svint32x4_t zn, uint64_t imm); + svuint16_t svsqrshru[_u16_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; - __attribute__((arm_streaming)) - svuint16_t svsqrshru[_u16_x4](svint64x4_t zn, uint64_t imm); - + svuint8_t svsqrshru[_u8_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; + + + svuint16_t svsqrshru[_u16_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; ``` #### SQRSHRUN @@ -11572,15 +11469,13 @@ Multi-vector saturating rounding shift right unsigned narrow Multi-vector saturating rounding shift right unsigned narrow and interleave ``` c - - __attribute__((arm_streaming_compatible)) - svuint16_t svsqrshrun[_u16_x2](svint32x2_t zn, uint64_t imm); - - + svuint16_t svsqrshrun[_u16_x2](svint32x2_t zn, uint64_t imm) + __arm_streaming_compatible; + + // Variants are also available for _u16 - __attribute__((arm_streaming)) - svuint8_t svsqrshrun[_u8_x4](svint32x4_t zn, uint64_t imm); - + svuint8_t svsqrshrun[_u8_x4](svint32x4_t zn, uint64_t imm) + __arm_streaming; ``` #### SQDMULH (single) @@ -11590,15 +11485,14 @@ Multi-vector signed saturating doubling multiply high ``` c // Variants are also available for _single_s16_x2, _single_s32_x2 // and _single_s64_x2 - __attribute__((arm_streaming)) - svint8x2_t svsqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm); - - + svint8x2_t svsqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm) + __arm_streaming; + + // Variants are also available for _single_s16_x4, _single_s32_x4 // and _single_s64_x4 - __attribute__((arm_streaming)) - svint8x4_t svsqdmulh[_single_s8_x4](svint8x4_t zdn, svint8_t zm); - + svint8x4_t svsqdmulh[_single_s8_x4](svint8x4_t zdn, 
svint8_t zm) + __arm_streaming; ``` #### SQDMULH (multi) @@ -11607,14 +11501,11 @@ Multi-vector signed saturating doubling multiply high ``` c // Variants are also available for _s16_x2, _s32_x2 and _s64_x2 - __attribute__((arm_streaming)) - svint8x2_t svsqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm); - - + svint8x2_t svsqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; + + // Variants are also available for _s16_x4, _s32_x4 and _s64_x4 - __attribute__((arm_streaming)) - svint8x4_t svsqdmulh[_s8_x4](svint8x4_t zdn, svint8x4_t zm); - + svint8x4_t svsqdmulh[_s8_x4](svint8x4_t zdn, svint8x4_t zm) __arm_streaming; ``` #### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT @@ -11623,88 +11514,78 @@ While (resulting in predicate-as-counter). ``vl`` is expected to be 2 or 4. ``` c // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilege_c8(int64_t rn, int64_t rm, uint64_t vl); - - + svcount_t svwhilege_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilegt_c8(int64_t rn, int64_t rm, uint64_t vl); - - + svcount_t svwhilegt_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilehi_c8(uint64_t rn, uint64_t rm, uint64_t vl); - - + svcount_t svwhilehi_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilehs_c8(uint64_t rn, uint64_t rm, uint64_t vl); - - + svcount_t svwhilehs_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilele_c8(int64_t rn, int64_t rm, uint64_t vl); - - + svcount_t svwhilele_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilelo_c8(uint64_t rn, uint64_t rm, uint64_t vl); - - + svcount_t svwhilelo_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilels_c8(uint64_t rn, uint64_t rm, uint64_t vl); - - + svcount_t svwhilels_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming; + + // Variants are also available for _c16, _c32 and _c64 - __attribute__((arm_streaming)) - svcount_t svwhilelt_c8(int64_t rn, int64_t rm, uint64_t vl); - + svcount_t svwhilelt_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming; ``` While (resulting in predicate tuple) ``` c // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilege_b8_x2(int64_t rn, int64_t rm); - - + svboolx2_t svwhilege_b8_x2(int64_t rn, int64_t rm) + __arm_streaming_compatible; + + // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilegt_b8_x2(int64_t rn, int64_t rm); - - + svboolx2_t svwhilegt_b8_x2(int64_t rn, int64_t rm) + __arm_streaming_compatible; + + // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilehi_b8_x2(uint64_t rn, uint64_t rm); - - + svboolx2_t svwhilehi_b8_x2(uint64_t rn, uint64_t rm) + __arm_streaming_compatible; + + // Variants are also available for 
_b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilehs_b8_x2(uint64_t rn, uint64_t rm); - - + svboolx2_t svwhilehs_b8_x2(uint64_t rn, uint64_t rm) + __arm_streaming_compatible; + + // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilele_b8_x2(int64_t rn, int64_t rm); - - + svboolx2_t svwhilele_b8_x2(int64_t rn, int64_t rm) + __arm_streaming_compatible; + + // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilelo_b8_x2(uint64_t rn, uint64_t rm); - - + svboolx2_t svwhilelo_b8_x2(uint64_t rn, uint64_t rm) + __arm_streaming_compatible; + + // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilels_b8_x2(uint64_t rn, uint64_t rm); - - + svboolx2_t svwhilels_b8_x2(uint64_t rn, uint64_t rm) + __arm_streaming_compatible; + + // Variants are also available for _b16, _b32 and _b64 - __attribute__((arm_streaming_compatible)) - svboolx2_t svwhilelt_b8_x2(int64_t rn, int64_t rm); - + svboolx2_t svwhilelt_b8_x2(int64_t rn, int64_t rm) + __arm_streaming_compatible; ``` #### SUNPK, UUNPK @@ -11713,14 +11594,11 @@ Multi-vector pack/unpack ``` c // Variants are also available for _u16_x2, _u32_x2, _s32_x2, _u64_x2 and _s64_x2 - __attribute__((arm_streaming)) - svint16x2_t svunpk[_s16_x2](svint8_t zn); - - + svint16x2_t svunpk[_s16_x2](svint8_t zn) __arm_streaming; + + // Variants are also available for _u16_x4, _u32_x4, _s32_x4, _u64_x4 and _s64_x4 - __attribute__((arm_streaming)) - svint16x4_t svunpk[_s16_x4](svint8x2_t zn); - + svint16x4_t svunpk[_s16_x4](svint8x2_t zn) __arm_streaming; ``` #### ZIP @@ -11730,15 +11608,13 @@ Multi-vector zip. ``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) - svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm); - - + svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) svint8x4_t svzip[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); + svint8_t zn3) __arm_streaming; ``` The `svzipq` intrinsics operate on quad-words, but for convenience accept all @@ -11748,15 +11624,13 @@ element types. ``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) - svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm); - + svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) svint8x4_t svzipq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); + svint8_t zn3) __arm_streaming; ``` #### UZP @@ -11766,33 +11640,28 @@ Multi-vector unzip. 
``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) - svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm); - - + svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) svint8x4_t svuzp[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); + svint8_t zn3) __arm_streaming; ``` - + The `svuzpq` intrinsics operate on quad-words, but for convenience accept all element types. ``` c // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) - svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm); - - + svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + + // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, // _u64, _s64 and _f64 - __attribute__((arm_streaming)) svint8x4_t svuzpq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3); - + svint8_t zn3) __arm_streaming; ``` ### Streaming-compatible versions of standard routines From 6179d7e3f684352d1221946388488ef01af2c4c6 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 3 Nov 2023 15:05:15 +0000 Subject: [PATCH 20/25] Add missing overloaded types [to squash] Added missing overloads for svsub, svbmopa, svluti2 and svluti4. Also some fixes: * Fixed type suffix for svsumula/svusmla (u8 -> s8 and vice versa) * Fixed type suffix for svdot[_f32_f16_f16] -> svdot[_f32_f16] * Fixed type suffix for svdot_lane[_f32] -> svdot_lane[_f32_f16] --- main/acle.md | 48 ++++++++++++++++++++++++++++++------------------ 1 file changed, 30 insertions(+), 18 deletions(-) diff --git a/main/acle.md b/main/acle.md index 0def379a..ea0d1d16 100644 --- a/main/acle.md +++ b/main/acle.md @@ -9826,8 +9826,10 @@ Multi-vector add/sub and accumulate into ZA // Variants are available for: // _za32[_f32] + // _za32[_s32] // _za32[_u32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) void svsub_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm) __arm_streaming __arm_shared_za; @@ -9835,8 +9837,10 @@ Multi-vector add/sub and accumulate into ZA // Variants are available for: // _za32[_f32] + // _za32[_s32] // _za32[_u32] // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0) + // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0) void svsub_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm) __arm_streaming __arm_shared_za; @@ -9898,22 +9902,23 @@ Multi-vector saturating extract narrow and interleave #### UDOT, SDOT, FDOT (vectors) -Multi-vector dot-product (2-way and 4-way) +Multi-vector dot-product (2-way) ``` c - // Variants are also available for _s32_s16_s16 and _u32_u16_u16 - svfloat32_t svdot[_f32_f16_f16](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm) + // Variants are also available for _s32_s16 and _u32_u16 + svfloat32_t svdot[_f32_f16](svfloat32_t zda, svfloat16_t zn, + svfloat16_t zm) __arm_streaming_compatible; ``` #### UDOT, SDOT, FDOT (indexed) -Multi-vector dot-product (2-way and 4-way) +Multi-vector dot-product (2-way) ``` c - // Variants are also available for _s32 and _u32 - svfloat32_t svdot_lane[_f32](svfloat32_t zda, svfloat16_t zn, svfloat16_t zm, - uint64_t imm_idx) + // Variants are also 
available for _s32_s16 and _u32_u16 + svfloat32_t svdot_lane[_f32_f16](svfloat32_t zda, svfloat16_t zn, + svfloat16_t zm, uint64_t imm_idx) __arm_streaming_compatible; ``` @@ -10117,11 +10122,13 @@ Integer sum of outer products and accumulate/subtract (2-way) Bitwise exclusive NOR population count outer product and accumulate/subtract ``` c + // Variants are also available for _za32[_s32] void svbmopa_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __arm_streaming __arm_shared_za; + // Variants are also available for _za32[_s32] void svbmops_za32[_u32]_m(uint64_t tile, svbool_t pn, svbool_t pm, svuint32_t zn, svuint32_t zm) __arm_streaming __arm_shared_za; @@ -10399,27 +10406,27 @@ Multi-vector multiply-add long long (widening) __arm_streaming __arm_shared_za; - void svsumla[_single]_za32[_u8]_vg4x2(uint32_t slice, svint8x2_t zn, + void svsumla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svuint8_t zm) __arm_streaming __arm_shared_za; - void svsumla[_single]_za32[_u8]_vg4x4(uint32_t slice, svint8x4_t zn, + void svsumla[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svuint8_t zm) __arm_streaming __arm_shared_za; - void svusmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svuint8_t zn, + void svusmla[_single]_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za; - void svusmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, + void svusmla[_single]_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8_t zm) __arm_streaming __arm_shared_za; - void svusmla[_single]_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, + void svusmla[_single]_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8_t zm) __arm_streaming __arm_shared_za; ``` @@ -11176,35 +11183,40 @@ Zero ZT0 Lookup table read with 2-bit and 4-bit indexes ``` c - // Variants are also available for _zt[_u16] and _zt[_u32] + // Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32] + // and _zt[_s32] svuint8_t svluti2_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx) __arm_streaming __arm_shared_za __arm_preserves_za; - // Variants are also available for _zt[_u16] and _zt[_u32] + // Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32] + // and _zt[_s32] svuint8x2_t svluti2_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn, uint64_t imm_idx) __arm_streaming __arm_shared_za __arm_preserves_za; - // Variants are also available for _zt[_u16] and _zt[_u32] + // Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32] + // and _zt[_s32] svuint8x4_t svluti2_lane_zt[_u8]_x4(uint64_t zt, svuint8_t zn, uint64_t imm_idx) __arm_streaming __arm_shared_za __arm_preserves_za; - // Variants are also available for _zt[_u16] and _zt[_u32] + // Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32] + // and _zt[_s32] svuint8_t svluti4_lane_zt[_u8](uint64_t zt, svuint8_t zn, uint64_t imm_idx) __arm_streaming __arm_shared_za __arm_preserves_za; - // Variants are also available for _zt[_u16] and _zt[_u32] + // Variants are also available for _zt[_s8], _zt[_u16], _zt[_s16], _zt[_u32] + // and _zt[_s32] svuint8x2_t svluti4_lane_zt[_u8]_x2(uint64_t zt, svuint8_t zn, uint64_t imm_idx) __arm_streaming __arm_shared_za __arm_preserves_za; - // Variants are also available for _zt[_u32] + // Variants are also available for _zt[_s16], _zt[_u32] and _zt[_s32] svuint16x4_t svluti4_lane_zt[_u16]_x4(uint64_t zt, svuint16_t zn, uint64_t imm_idx) __arm_streaming __arm_shared_za __arm_preserves_za; From 
d2e18e5f91e8a03ecb18c0e79168d3ba7cb0c33b Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 3 Nov 2023 15:07:00 +0000 Subject: [PATCH 21/25] Remove incorrect [_single] suffix [to squash] The [_single] suffix did not apply to these intrinsics. --- main/acle.md | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/main/acle.md b/main/acle.md index ea0d1d16..120b4133 100644 --- a/main/acle.md +++ b/main/acle.md @@ -10251,8 +10251,8 @@ Multi-vector multiply-add long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - void svmla[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, - svbfloat16_t zm) + void svmla_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, + svbfloat16_t zm) __arm_streaming __arm_shared_za; @@ -10314,8 +10314,8 @@ Multi-vector multiply-subtract long (widening) ``` c // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16] - void svmls[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, - svbfloat16_t zm) + void svmls_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn, + svbfloat16_t zm) __arm_streaming __arm_shared_za; @@ -10381,8 +10381,7 @@ Multi-vector multiply-add long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - void svmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, - svint8_t zm) + void svmla_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za; @@ -10416,8 +10415,7 @@ Multi-vector multiply-add long long (widening) __arm_streaming __arm_shared_za; - void svusmla[_single]_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn, - svint8_t zm) + void svusmla_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za; @@ -10537,8 +10535,7 @@ Multi-vector multiply-subtract long long (widening) // _za32[_u8] // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0) // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0) - void svmls[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, - svint8_t zm) + void svmls_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm) __arm_streaming __arm_shared_za; From 58827fe56ac1c45c350e45ecfe150b2fb828ce72 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 3 Nov 2023 15:08:25 +0000 Subject: [PATCH 22/25] Add new intrinsics for svsudot and svsumla [to squash] Add new symmetrical intrinsics for svsudot (mapping to usdot, by swapping the operands) and similar for svsumla (mapping to usmla by swapping the operands) --- main/acle.md | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/main/acle.md b/main/acle.md index 120b4133..b0f71016 100644 --- a/main/acle.md +++ b/main/acle.md @@ -10008,6 +10008,14 @@ Multi-vector dot-product (2-way and 4-way) __arm_streaming __arm_shared_za; + void svsudot_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn, svuint8x2_t zm) + __arm_streaming __arm_shared_za; + + + void svsudot_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svuint8x4_t zm) + __arm_streaming __arm_shared_za; + + void svusdot_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm) __arm_streaming __arm_shared_za; @@ -10405,6 +10413,10 @@ Multi-vector multiply-add long long (widening) __arm_streaming __arm_shared_za; + void svsumla_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svuint8_t zm) + __arm_streaming __arm_shared_za; + + void svsumla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svuint8_t zm) 
    __arm_streaming __arm_shared_za;
@@ -10452,11 +10464,19 @@ Multi-vector multiply-add long long (widening)
     __arm_streaming __arm_shared_za;
 
 
-  void svusmla_za32[_s8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm)
+  void svsumla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svuint8x2_t zm)
+    __arm_streaming __arm_shared_za;
+
+
+  void svsumla_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svuint8x4_t zm)
+    __arm_streaming __arm_shared_za;
+
+
+  void svusmla_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm)
     __arm_streaming __arm_shared_za;
 
 
-  void svusmla_za32[_s8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm)
+  void svusmla_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm)
     __arm_streaming __arm_shared_za;
 ```

From 7959b34ca9901e14d3f49c56e20bc97bbb104b49 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 3 Nov 2023 15:32:06 +0000
Subject: [PATCH 23/25] Apply consistent naming convention for _x2 and _x4 [to squash]

We weren't entirely consistent with the use of _x2 and _x4.

This also changes svzip and svuzp to take a vector tuple, rather than
separate operands.
---
 main/acle.md | 103 +++++++++++++++++++++++++--------------------------
 1 file changed, 51 insertions(+), 52 deletions(-)

diff --git a/main/acle.md b/main/acle.md
index b0f71016..343ce8d1 100644
--- a/main/acle.md
+++ b/main/acle.md
@@ -9245,9 +9245,10 @@ ZA array vectors. The intrinsics model this in the following way:
     left up to the compiler to choose the most optimal form.
 
 * Intrinsic functions have a `_x2` or `_x4` suffix if the
-  function\'s return value is a vector group of 2 or 4 data vectors
+  function\'s widest type is a vector tuple of 2 or 4 data vectors
   and the function operates purely on vectors, not on the matrix array or
-  tile slices.
+  tile slices. The suffix is only present on overloaded names if it cannot
+  be inferred from arguments.
 
 * Intrinsic functions have a `_vg2` or `_vg4` suffix if the function
   operates on groups of 2 or 4 ZA tile slices.
For example: @@ -11447,16 +11448,16 @@ Multi-vector rounding shift left Multi-vector saturating rounding shift right narrow ``` c - // Variants are also available for _u16_x2 - svint16_t svqrshr[_s16_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; + // Variants are also available for _u8[_u32_x4] + svint8_t svqrshr_s8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _u8_x4 - svint8_t svqrshr[_s8_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; + // Variants are also available for _u16[_u32_x2] + svint16_t svqrshr_s16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _u16_x4 - svint16_t svqrshr[_s16_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; + // Variants are also available for _u16[_u64_x4] + svint16_t svqrshr_s16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; ``` #### SQRSHRN, UQRSHRN @@ -11464,18 +11465,18 @@ Multi-vector saturating rounding shift right narrow Multi-vector saturating rounding shift right narrow and interleave ``` c - // Variants are also available for _u16_x2 - svint16_t svqrshrn[_s16_x2](svint32x2_t zn, uint64_t imm) + // Variants are also available for _u8[_u32_x4] + svint8_t svqrshrn_s8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; - // Variants are also available for _u8_x4 - svint8_t svqrshrn[_s8_x4](svint32x4_t zn, uint64_t imm) + // Variants are also available for _u16[_u32_x2] + svint16_t svqrshrn_s16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming_compatible; - // Variants are also available for _u16_x4 - svint16_t svqrshrn[_s16_x4](svint64x4_t zn, uint64_t imm) + // Variants are also available for _u16[_u64_x4] + svint16_t svqrshrn_s16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; ``` @@ -11484,13 +11485,13 @@ Multi-vector saturating rounding shift right narrow and interleave Multi-vector saturating rounding shift right unsigned narrow ``` c - svuint16_t svsqrshru[_u16_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; + svuint8_t svqrshru_u8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; - svuint8_t svsqrshru[_u8_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; + svuint16_t svqrshru_u16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming; - svuint16_t svsqrshru[_u16_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; + svuint16_t svqrshru_u16[_s64_x4](svint64x4_t zn, uint64_t imm) __arm_streaming; ``` #### SQRSHRUN @@ -11498,12 +11499,12 @@ Multi-vector saturating rounding shift right unsigned narrow Multi-vector saturating rounding shift right unsigned narrow and interleave ``` c - svuint16_t svsqrshrun[_u16_x2](svint32x2_t zn, uint64_t imm) + svuint16_t svqrshrun_u16[_s32_x2](svint32x2_t zn, uint64_t imm) __arm_streaming_compatible; - // Variants are also available for _u16 - svuint8_t svsqrshrun[_u8_x4](svint32x4_t zn, uint64_t imm) + // Variants are also available for _u16[_s64_x4] + svuint8_t svqrshrun_u8[_s32_x4](svint32x4_t zn, uint64_t imm) __arm_streaming; ``` @@ -11622,12 +11623,14 @@ While (resulting in predicate tuple) Multi-vector pack/unpack ``` c - // Variants are also available for _u16_x2, _u32_x2, _s32_x2, _u64_x2 and _s64_x2 - svint16x2_t svunpk[_s16_x2](svint8_t zn) __arm_streaming; + // Variants are also available for _u16[_u8_x2], _u32[_u16_x2], _s32[_s16_x2], + // _u64[_u32_x2] and _s64[_s32_x2] + svint16x2_t svunpk_s16[_s8_x2](svint8_t zn) __arm_streaming; - // Variants are also available for _u16_x4, _u32_x4, _s32_x4, _u64_x4 and _s64_x4 - svint16x4_t svunpk[_s16_x4](svint8x2_t zn) 
__arm_streaming; + // Variants are also available for _u16[_u8_x4], _u32[_u16_x4], _s32[_s16_x4], + // _u64[_u32_x4] and _s64[_s32_x4] + svint16x4_t svunpk_s16[_s8_x4](svint8x2_t zn) __arm_streaming; ``` #### ZIP @@ -11635,15 +11638,14 @@ Multi-vector pack/unpack Multi-vector zip. ``` c - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x2_t svzip[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svzip[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x4_t svzip[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3) __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svzip[_s8_x4](svint8x4_t zn) __arm_streaming; ``` The `svzipq` intrinsics operate on quad-words, but for convenience accept all @@ -11651,15 +11653,14 @@ element types. ``` c - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x2_t svzipq[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svzipq[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x4_t svzipq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3) __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svzipq[_s8_x4](svint8x4_t zn) __arm_streaming; ``` #### UZP @@ -11667,30 +11668,28 @@ element types. Multi-vector unzip. ``` c - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x2_t svuzp[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svuzp[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x4_t svuzp[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3) __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svuzp[_s8_x4](svint8x4_t zn) __arm_streaming; ``` The `svuzpq` intrinsics operate on quad-words, but for convenience accept all element types. 
``` c - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x2_t svuzpq[_s8]_x2(svint8_t zn, svint8_t zm) __arm_streaming; + // Variants are also available for _u8_x2, _u16_x2, _s16_x2, _f16_x2, + // _bf16_x2, _u32_x2, _s32_x2, _f32_x2, _u64_x2, _s64_x2 and _f64_x2 + svint8x2_t svuzpq[_s8_x2](svint8x2_t zn) __arm_streaming; - // Variants are also available for _u8, _u16, _s16, _f16, _bf16, _u32, _s32, _f32, - // _u64, _s64 and _f64 - svint8x4_t svuzpq[_s8]_x4(svint8_t zn, svint8_t zn1, svint8_t zn2, - svint8_t zn3) __arm_streaming; + // Variants are also available for _u8_x4, _u16_x4, _s16_x4, _f16_x4, + // _bf16_x4, _u32_x4, _s32_x4, _f32_x4, _u64_x4, _s64_x4 and _f64_x4 + svint8x4_t svuzpq[_s8_x4](svint8x4_t zn) __arm_streaming; ``` ### Streaming-compatible versions of standard routines From d55a5f76bc95e1d0639b33927f3404d06578deb1 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 3 Nov 2023 15:37:15 +0000 Subject: [PATCH 24/25] Remove leading 's' from sv[s]qdmulh [to squash] --- main/acle.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/main/acle.md b/main/acle.md index 343ce8d1..6edacf30 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11515,13 +11515,13 @@ Multi-vector signed saturating doubling multiply high ``` c // Variants are also available for _single_s16_x2, _single_s32_x2 // and _single_s64_x2 - svint8x2_t svsqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm) + svint8x2_t svqdmulh[_single_s8_x2](svint8x2_t zdn, svint8_t zm) __arm_streaming; // Variants are also available for _single_s16_x4, _single_s32_x4 // and _single_s64_x4 - svint8x4_t svsqdmulh[_single_s8_x4](svint8x4_t zdn, svint8_t zm) + svint8x4_t svqdmulh[_single_s8_x4](svint8x4_t zdn, svint8_t zm) __arm_streaming; ``` @@ -11531,11 +11531,11 @@ Multi-vector signed saturating doubling multiply high ``` c // Variants are also available for _s16_x2, _s32_x2 and _s64_x2 - svint8x2_t svsqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; + svint8x2_t svqdmulh[_s8_x2](svint8x2_t zdn, svint8x2_t zm) __arm_streaming; // Variants are also available for _s16_x4, _s32_x4 and _s64_x4 - svint8x4_t svsqdmulh[_s8_x4](svint8x4_t zdn, svint8x4_t zm) __arm_streaming; + svint8x4_t svqdmulh[_s8_x4](svint8x4_t zdn, svint8x4_t zm) __arm_streaming; ``` #### WHILEGE, WHILEGT, WHILEHI, WHILEHS, WHILELE, WHILELO, WHILELS, WHILELT From cdaf16f2fbe329f8951acfe11a6ae227f753d521 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 3 Nov 2023 17:00:53 +0000 Subject: [PATCH 25/25] Unify svwhile signed/unsigned to use the same mnemonics [to squash] --- main/acle.md | 84 ++++++++++++++++++++-------------------------------- 1 file changed, 32 insertions(+), 52 deletions(-) diff --git a/main/acle.md b/main/acle.md index 6edacf30..6a3cddfa 100644 --- a/main/acle.md +++ b/main/acle.md @@ -11543,78 +11543,58 @@ Multi-vector signed saturating doubling multiply high While (resulting in predicate-as-counter). ``vl`` is expected to be 2 or 4. 
``` c
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilege_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming;
-
-
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilegt_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming;
-
-
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilehi_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming;
-
-
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilehs_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming;
-
-
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilele_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming;
+  // Variants are also available for _c16[_s64], _c32[_s64], _c64[_s64],
+  // _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
+  svcount_t svwhilege_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
+    __arm_streaming;
 
 
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilelo_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming;
+  // Variants are also available for _c16[_s64], _c32[_s64], _c64[_s64],
+  // _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
+  svcount_t svwhilegt_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
+    __arm_streaming;
 
 
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilels_c8(uint64_t rn, uint64_t rm, uint64_t vl) __arm_streaming;
+  // Variants are also available for _c16[_s64], _c32[_s64], _c64[_s64],
+  // _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
+  svcount_t svwhilele_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
+    __arm_streaming;
 
 
-  // Variants are also available for _c16, _c32 and _c64
-  svcount_t svwhilelt_c8(int64_t rn, int64_t rm, uint64_t vl) __arm_streaming;
+  // Variants are also available for _c16[_s64], _c32[_s64], _c64[_s64],
+  // _c8[_u64], _c16[_u64], _c32[_u64] and _c64[_u64]
+  svcount_t svwhilelt_c8[_s64](int64_t rn, int64_t rm, uint64_t vl)
+    __arm_streaming;
 ```
 
 While (resulting in predicate tuple)
 
 ``` c
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilege_b8_x2(int64_t rn, int64_t rm)
-    __arm_streaming_compatible;
-
-
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilegt_b8_x2(int64_t rn, int64_t rm)
-    __arm_streaming_compatible;
-
-
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilehi_b8_x2(uint64_t rn, uint64_t rm)
-    __arm_streaming_compatible;
-
-
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilehs_b8_x2(uint64_t rn, uint64_t rm)
-    __arm_streaming_compatible;
-
-
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilele_b8_x2(int64_t rn, int64_t rm)
+  // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
+  // _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
+  // _b64[_u64]_x2
+  svboolx2_t svwhilege_b8[_s64]_x2(int64_t rn, int64_t rm)
     __arm_streaming_compatible;
 
 
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilelo_b8_x2(uint64_t rn, uint64_t rm)
+  // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
+  // _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
+  // _b64[_u64]_x2
+  svboolx2_t svwhilegt_b8[_s64]_x2(int64_t rn, int64_t rm)
     __arm_streaming_compatible;
 
 
-  // Variants are also available for _b16, _b32 and _b64
-  svboolx2_t svwhilels_b8_x2(uint64_t rn, uint64_t rm)
+  // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2,
+  // _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and
+  //
_b64[_u64]_x2 + svboolx2_t svwhilele_b8[_s64]_x2(int64_t rn, int64_t rm) __arm_streaming_compatible; - // Variants are also available for _b16, _b32 and _b64 - svboolx2_t svwhilelt_b8_x2(int64_t rn, int64_t rm) + // Variants are also available for _b16[_s64]_x2, _b32[_s64]_x2, + // _b64[_s64]_x2, _b8[_u64]_x2, _b16[_u64]_x2, _b32[_u64]_x2 and + // _b64[_u64]_x2 + svboolx2_t svwhilelt_b8[_s64]_x2(int64_t rn, int64_t rm) __arm_streaming_compatible; ```
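As a usage illustration only (not part of the patch series or of the normative ACLE text): the sketch below shows one way the predicate-tuple `svwhilelt_b8[_s64]_x2` form declared above might drive a predicated copy loop. It assumes a toolchain where `__ARM_FEATURE_SME2` is defined, that the first predicate in the returned tuple covers elements `[i, i+VL)` and the second covers `[i+VL, i+2*VL)`, and that the function name `copy_bytes` is purely hypothetical.

``` c
#include <arm_sve.h>
#include <arm_sme.h>

// Hypothetical streaming-compatible byte copy driven by a predicate pair
// produced with a single svwhilelt_b8_x2 call.
void copy_bytes(uint8_t *dst, const uint8_t *src, int64_t n)
  __arm_streaming_compatible
{
  int64_t vl = (int64_t)svcntb();            // 8-bit elements per vector
  for (int64_t i = 0; i < n; i += 2 * vl) {
    svboolx2_t pg = svwhilelt_b8_x2(i, n);   // two predicates at once
    svbool_t p0 = svget2(pg, 0);             // assumed: elements [i, i+vl)
    svbool_t p1 = svget2(pg, 1);             // assumed: elements [i+vl, i+2*vl)
    svst1_u8(p0, dst + i,      svld1_u8(p0, src + i));
    svst1_u8(p1, dst + i + vl, svld1_u8(p1, src + i + vl));
  }
}
```

Compared with issuing two separate `svwhilelt_b8` calls, the tuple form would presumably let the compiler use the SME2 pair-producing WHILELT encoding directly, keeping both predicates consistent with a single loop bound check.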