From 6e8b68d497cbf4d225a069a7f8a6420d780b77ab Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Mon, 2 Oct 2023 15:12:37 +0100
Subject: [PATCH] Change uint64_t slice -> uint32_t slice

---
 main/acle.md | 212 +++++++++++++++++++++++++--------------------------
 1 file changed, 106 insertions(+), 106 deletions(-)

diff --git a/main/acle.md b/main/acle.md
index ac7ab38b..248bcc4c 100644
--- a/main/acle.md
+++ b/main/acle.md
@@ -9255,7 +9255,7 @@ ZA array vectors. The intrinsics model this in the following way:
   ``` c
   // Reads 2 consecutive horizontal tile slices from ZA into multi-vector.
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
-  svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint64_t slice);
+  svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice);
   ```
 
 * Intrinsic functions have a `_vg1x2`, `_vg1x4` suffix if the function
@@ -9271,7 +9271,7 @@ ZA array vectors. The intrinsics model this in the following way:
 
   ``` c
   // SMLAL intrinsic for 2 quad-vector groups.
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                   svint8_t zm, uint64_t imm_idx);
   ```
@@ -9285,15 +9285,15 @@ ZA array vectors. The intrinsics model this in the following way:
 
   ``` c
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm);
+  void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                       svint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                   svint8_t zm, uint64_t imm_idx);
   ```
@@ -9699,7 +9699,7 @@ the result is written directly into ZA.
   // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svadd_write[_single]_za32[_s32]_vg1x2(uint64_t slice, svint32x2_t zn,
+  void svadd_write[_single]_za32[_s32]_vg1x2(uint32_t slice, svint32x2_t zn,
                                              svint32_t zm);
 
 
@@ -9709,7 +9709,7 @@ the result is written directly into ZA.
   // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svadd_write[_single]_za32[_s32]_vg1x4(uint64_t slice, svint32x4_t zn,
+  void svadd_write[_single]_za32[_s32]_vg1x4(uint32_t slice, svint32x4_t zn,
                                              svint32_t zm);
 
 
@@ -9717,7 +9717,7 @@ the result is written directly into ZA.
   // _za32[_u32]
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svsub_write[_single]_za32[_u32]_vg1x2(uint64_t slice, svuint32x2_t zn,
+  void svsub_write[_single]_za32[_u32]_vg1x2(uint32_t slice, svuint32x2_t zn,
                                              svuint32_t zm);
 
 
@@ -9725,7 +9725,7 @@ the result is written directly into ZA.
   // _za32[_u32]
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svsub_write[_single]_za32[_u32]_vg1x4(uint64_t slice, svuint32x4_t zn,
+  void svsub_write[_single]_za32[_u32]_vg1x4(uint32_t slice, svuint32x4_t zn,
                                              svuint32_t zm);
   ```
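For illustration, the three forms described above might be called as follows. This is a minimal, non-normative sketch: the non-overloaded spellings (bracketed name parts written out, e.g. `svmla_za32_s8_vg4x2`), the wrapper function, and the lane index are all assumptions, and it presumes a compiler that implements the SME attributes used in this document:

``` c
// Hypothetical caller (not part of the patch): exercises the multi-vector,
// single-vector and indexed-lane forms of svmla with a uint32_t slice.
__attribute__((arm_streaming, arm_shared_za))
void mla_forms(uint32_t slice, svint8x2_t zn, svint8x2_t zm, svint8_t zk) {
  svmla_za32_s8_vg4x2(slice, zn, zm);         // multi-vector x multi-vector
  svmla_single_za32_s8_vg4x2(slice, zn, zk);  // multi-vector x single vector
  svmla_lane_za32_s8_vg4x2(slice, zn, zk, 0); // multi-vector x lane 0 of zk
}
```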
@@ -9744,7 +9744,7 @@ the result is written directly into ZA.
   // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svadd_write_za32[_s32]_vg1x2(uint64_t slice,
+  void svadd_write_za32[_s32]_vg1x2(uint32_t slice,
                                     svint32x2_t zn, svint32x2_t zm);
 
 
@@ -9754,7 +9754,7 @@ the result is written directly into ZA.
   // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svadd_write_za32[_s32]_vg1x4(uint64_t slice,
+  void svadd_write_za32[_s32]_vg1x4(uint32_t slice,
                                     svint32x4_t zn, svint32x4_t zm);
 
 
@@ -9762,7 +9762,7 @@ the result is written directly into ZA.
   // _za32[_u32]
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svsub_write_za32[_u32]_vg1x2(uint64_t slice,
+  void svsub_write_za32[_u32]_vg1x2(uint32_t slice,
                                     svuint32x2_t zn, svuint32x2_t zm);
 
 
@@ -9770,7 +9770,7 @@ the result is written directly into ZA.
   // _za32[_u32]
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svsub_write_za32[_u32]_vg1x4(uint64_t slice,
+  void svsub_write_za32[_u32]_vg1x4(uint32_t slice,
                                     svuint32x4_t zn, svuint32x4_t zm);
   ```
@@ -9808,7 +9808,7 @@ Multi-vector add/sub and accumulate into ZA
   // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svadd_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zm);
+  void svadd_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm);
 
 
   // Variants are available for:
@@ -9819,7 +9819,7 @@ Multi-vector add/sub and accumulate into ZA
   // _za64[_s64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svadd_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zm);
+  void svadd_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm);
 
 
   // Variants are available for:
@@ -9828,7 +9828,7 @@ Multi-vector add/sub and accumulate into ZA
   // _za32[_u32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svsub_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zm);
+  void svsub_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zm);
 
   // Variants are available for:
@@ -9837,7 +9837,7 @@ Multi-vector add/sub and accumulate into ZA
   // _za32[_u32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   // _za64[_u64] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svsub_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zm);
+  void svsub_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zm);
   ```
@@ -9945,7 +9945,7 @@ Multi-vector dot-product (2-way and 4-way)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svdot[_single]_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn,
+  void svdot[_single]_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn,
                                         svbfloat16_t zm);
 
 
@@ -9959,27 +9959,27 @@ Multi-vector dot-product (2-way and 4-way)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svdot[_single]_za32[_bf16]_vg1x4(uint64_t slice,
+  void svdot[_single]_za32[_bf16]_vg1x4(uint32_t slice,
                                         svbfloat16x4_t zn, svbfloat16_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsudot[_single]_za32[_s8]_vg1x2(uint64_t slice, svint8x2_t zn,
+  void svsudot[_single]_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn,
                                         svuint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsudot[_single]_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn,
+  void svsudot[_single]_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn,
                                         svuint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusdot[_single]_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn,
+  void svusdot[_single]_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn,
                                         svint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusdot[_single]_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn,
+  void svusdot[_single]_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn,
                                         svint8_t zm);
   ```
@@ -9999,7 +9999,7 @@ Multi-vector dot-product (2-way and 4-way)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svdot_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn,
+  void svdot_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn,
                                svbfloat16x2_t zm);
 
 
@@ -10013,16 +10013,16 @@ Multi-vector dot-product (2-way and 4-way)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svdot_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn,
+  void svdot_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn,
                                svbfloat16x4_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusdot_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8x2_t zm);
+  void svusdot_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusdot_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8x4_t zm);
+  void svusdot_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm);
   ```
@@ -10041,7 +10041,7 @@ Multi-vector dot-product (2-way and 4-way)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svdot_lane_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn,
+  void svdot_lane_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
 
 
@@ -10055,27 +10055,27 @@ Multi-vector dot-product (2-way and 4-way)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svdot_lane_za32[_bf16]_vg1x4(uint64_t slice, svbfloat16x4_t zn,
+  void svdot_lane_za32[_bf16]_vg1x4(uint32_t slice, svbfloat16x4_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsudot_lane_za32[_s8]_vg1x2(uint64_t slice, svint8x2_t zn, svuint8_t zm,
+  void svsudot_lane_za32[_s8]_vg1x2(uint32_t slice, svint8x2_t zn, svuint8_t zm,
                                     uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsudot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svuint8_t zm,
+  void svsudot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svuint8_t zm,
                                     uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusdot_lane_za32[_u8]_vg1x2(uint64_t slice, svuint8x2_t zn, svint8_t zm,
+  void svusdot_lane_za32[_u8]_vg1x2(uint32_t slice, svuint8x2_t zn, svint8_t zm,
                                     uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusdot_lane_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn, svint8_t zm,
+  void svusdot_lane_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn, svint8_t zm,
                                     uint64_t imm_idx);
   ```
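As a non-normative sketch of the indexed-lane dot-product signatures above (the full spelling `svdot_lane_za32_bf16_vg1x2`, the wrapper function, and the lane choice are assumptions for illustration):

``` c
// Hypothetical caller: 2-way bf16 dot-product accumulating into 32-bit ZA
// elements, selecting lane 1 of zm; imm_idx must be a compile-time constant.
__attribute__((arm_streaming, arm_shared_za))
void dot_example(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) {
  svdot_lane_za32_bf16_vg1x2(slice, zn, zm, 1);
}
```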
@@ -10087,18 +10087,18 @@ Multi-vector vertical dot-product by indexed element.
 
   ``` c
   __attribute__((arm_streaming, arm_shared_za))
-  void svsuvdot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn,
+  void svsuvdot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn,
                                      svuint8_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusvdot_lane_za32[_u8]_vg1x4(uint64_t slice, svuint8x4_t zn,
+  void svusvdot_lane_za32[_u8]_vg1x4(uint32_t slice, svuint8x4_t zn,
                                      svint8_t zm, uint64_t imm_idx);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svvdot_lane_za32[_bf16]_vg1x2(uint64_t slice, svbfloat16x2_t zn,
+  void svvdot_lane_za32[_bf16]_vg1x2(uint32_t slice, svbfloat16x2_t zn,
                                      svbfloat16_t zm, uint64_t imm_idx);
 
 
@@ -10108,7 +10108,7 @@ Multi-vector vertical dot-product by indexed element.
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svvdot_lane_za32[_s8]_vg1x4(uint64_t slice, svint8x4_t zn, svint8_t zm,
+  void svvdot_lane_za32[_s8]_vg1x4(uint32_t slice, svint8x4_t zn, svint8_t zm,
                                    uint64_t imm_idx);
   ```
@@ -10157,7 +10157,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn,
+  void svmla[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn,
                                        svfloat32_t zm);
 
 
@@ -10165,7 +10165,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn,
+  void svmla[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn,
                                        svfloat32_t zm);
 
 
@@ -10173,7 +10173,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn,
+  void svmls[_single]_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn,
                                        svfloat32_t zm);
 
 
@@ -10181,7 +10181,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn,
+  void svmls[_single]_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn,
                                        svfloat32_t zm);
   ```
@@ -10195,7 +10195,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn,
+  void svmla_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn,
                               svfloat32x2_t zm);
 
 
@@ -10203,7 +10203,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn,
+  void svmla_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn,
                               svfloat32x4_t zm);
 
 
@@ -10211,7 +10211,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn,
+  void svmls_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn,
                               svfloat32x2_t zm);
 
 
@@ -10219,7 +10219,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn,
+  void svmls_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn,
                               svfloat32x4_t zm);
   ```
@@ -10233,7 +10233,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn,
+  void svmla_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn,
                                    svfloat32_t zm, uint64_t imm_idx);
 
 
@@ -10241,7 +10241,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn,
+  void svmla_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn,
                                    svfloat32_t zm, uint64_t imm_idx);
 
 
@@ -10249,7 +10249,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_f32]_vg1x2(uint64_t slice, svfloat32x2_t zn,
+  void svmls_lane_za32[_f32]_vg1x2(uint32_t slice, svfloat32x2_t zn,
                                    svfloat32_t zm, uint64_t imm_idx);
 
 
@@ -10257,7 +10257,7 @@ Multi-vector floating-point fused multiply-add/subtract
   // _za32[_f32]
   // _za64[_f64] (only if __ARM_FEATURE_SME_F64F64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_f32]_vg1x4(uint64_t slice, svfloat32x4_t zn,
+  void svmls_lane_za32[_f32]_vg1x4(uint32_t slice, svfloat32x4_t zn,
                                    svfloat32_t zm, uint64_t imm_idx);
   ```
@@ -10269,19 +10269,19 @@ Multi-vector multiply-add long (widening)
 
   ``` c
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn,
+  void svmla[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn,
                                         svbfloat16_t zm);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn,
+  void svmla[_single]_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn,
                                         svbfloat16_t zm);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn,
+  void svmla[_single]_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn,
                                         svbfloat16_t zm);
   ```
@@ -10293,13 +10293,13 @@ Multi-vector multiply-add long (widening)
 
   ``` c
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn,
+  void svmla_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn,
                                svbfloat16x2_t zm);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn,
+  void svmla_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn,
                                svbfloat16x4_t zm);
   ```
@@ -10311,19 +10311,19 @@ Multi-vector multiply-add long (widening)
 
   ``` c
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn,
+  void svmla_lane_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn,
+  void svmla_lane_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn,
+  void svmla_lane_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
   ```
@@ -10335,19 +10335,19 @@ Multi-vector multiply-subtract long (widening)
 
   ``` c
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn,
+  void svmls[_single]_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn,
                                         svbfloat16_t zm);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn,
+  void svmls[_single]_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn,
                                         svbfloat16_t zm);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn,
+  void svmls[_single]_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn,
                                         svbfloat16_t zm);
   ```
@@ -10359,13 +10359,13 @@ Multi-vector multiply-subtract long (widening)
 
   ``` c
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn,
+  void svmls_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn,
                                svbfloat16x2_t zm);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn,
+  void svmls_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn,
                                svbfloat16x4_t zm);
   ```
@@ -10377,19 +10377,19 @@ Multi-vector multiply-subtract long (widening)
 
   ``` c
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_bf16]_vg2x1(uint64_t slice, svbfloat16_t zn,
+  void svmls_lane_za32[_bf16]_vg2x1(uint32_t slice, svbfloat16_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_bf16]_vg2x2(uint64_t slice, svbfloat16x2_t zn,
+  void svmls_lane_za32[_bf16]_vg2x2(uint32_t slice, svbfloat16x2_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
 
 
   // Variants are also available for _za32[_f16], _za32[_s16] and _za32[_u16]
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_bf16]_vg2x4(uint64_t slice, svbfloat16x4_t zn,
+  void svmls_lane_za32[_bf16]_vg2x4(uint32_t slice, svbfloat16x4_t zn,
                                     svbfloat16_t zm, uint64_t imm_idx);
   ```
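A minimal sketch of the widening multiply-add forms above, assuming the full spelling `svmla_lane_za32_bf16_vg2x2` per the bracket convention (the wrapper function and lane choice are illustrative only):

``` c
// Hypothetical caller: widening bf16 -> f32 multiply-add into two
// consecutive ZA vector groups starting at `slice`, using lane 0 of zm.
__attribute__((arm_streaming, arm_shared_za))
void mla_widen(uint32_t slice, svbfloat16x2_t zn, svbfloat16_t zm) {
  svmla_lane_za32_bf16_vg2x2(slice, zn, zm, 0);
}
```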
@@ -10405,7 +10405,7 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn,
+  void svmla[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn,
                                       svint8_t zm);
 
 
@@ -10415,7 +10415,7 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svmla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                       svint8_t zm);
 
 
@@ -10425,32 +10425,32 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn,
+  void svmla[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn,
                                       svint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsumla[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svsumla[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                         svuint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsumla[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn,
+  void svsumla[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn,
                                         svuint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla[_single]_za32[_u8]_vg4x1(uint64_t slice, svuint8_t zn,
+  void svusmla[_single]_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn,
                                         svint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla[_single]_za32[_u8]_vg4x2(uint64_t slice, svuint8x2_t zn,
+  void svusmla[_single]_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn,
                                         svint8_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla[_single]_za32[_u8]_vg4x4(uint64_t slice, svuint8x4_t zn,
+  void svusmla[_single]_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn,
                                         svint8_t zm);
   ```
@@ -10466,7 +10466,7 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm);
+  void svmla_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm);
 
 
   // Variants are available for:
@@ -10475,15 +10475,15 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8x4_t zm);
+  void svmla_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla_za32[_u8]_vg4x2(uint64_t slice, svuint8x2_t zn, svint8x2_t zm);
+  void svusmla_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn, svint8x2_t zm);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla_za32[_u8]_vg4x4(uint64_t slice, svuint8x4_t zn, svint8x4_t zm);
+  void svusmla_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn, svint8x4_t zm);
   ```
@@ -10498,7 +10498,7 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm,
+  void svmla_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm,
                                   uint64_t imm_idx);
 
 
@@ -10508,7 +10508,7 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm,
+  void svmla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm,
                                   uint64_t imm_idx);
 
 
@@ -10518,37 +10518,37 @@ Multi-vector multiply-add long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmla_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm,
+  void svmla_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm,
                                   uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsumla_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn,
+  void svsumla_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn,
                                     svuint8_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsumla_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svsumla_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                     svuint8_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svsumla_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn,
+  void svsumla_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn,
                                     svuint8_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla_lane_za32[_u8]_vg4x1(uint64_t slice, svuint8_t zn,
+  void svusmla_lane_za32[_u8]_vg4x1(uint32_t slice, svuint8_t zn,
                                     svint8_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla_lane_za32[_u8]_vg4x2(uint64_t slice, svuint8x2_t zn,
+  void svusmla_lane_za32[_u8]_vg4x2(uint32_t slice, svuint8x2_t zn,
                                     svint8_t zm, uint64_t imm_idx);
 
 
   __attribute__((arm_streaming, arm_shared_za))
-  void svusmla_lane_za32[_u8]_vg4x4(uint64_t slice, svuint8x4_t zn,
+  void svusmla_lane_za32[_u8]_vg4x4(uint32_t slice, svuint8x4_t zn,
                                     svint8_t zm, uint64_t imm_idx);
   ```
@@ -10564,7 +10564,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn,
+  void svmls[_single]_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn,
                                       svint8_t zm);
 
 
@@ -10574,7 +10574,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn,
+  void svmls[_single]_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn,
                                       svint8_t zm);
 
 
@@ -10584,7 +10584,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls[_single]_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn,
+  void svmls[_single]_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn,
                                       svint8_t zm);
   ```
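For the mixed-signedness long long forms above, a call might look like the following non-normative sketch (the full spelling `svsumla_lane_za32_s8_vg4x2` is an assumption based on the bracket convention, as are the wrapper and lane choice):

``` c
// Hypothetical caller: signed-by-unsigned 8-bit -> 32-bit widening
// multiply-add into four consecutive ZA vector groups, using lane 0 of zm.
__attribute__((arm_streaming, arm_shared_za))
void sumla_example(uint32_t slice, svint8x2_t zn, svuint8_t zm) {
  svsumla_lane_za32_s8_vg4x2(slice, zn, zm, 0);
}
```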
@@ -10600,7 +10600,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8x2_t zm);
+  void svmls_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8x2_t zm);
 
 
   // Variants are available for:
@@ -10609,7 +10609,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8x4_t zm);
+  void svmls_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8x4_t zm);
   ```
 
@@ -10624,7 +10624,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_s8]_vg4x1(uint64_t slice, svint8_t zn, svint8_t zm,
+  void svmls_lane_za32[_s8]_vg4x1(uint32_t slice, svint8_t zn, svint8_t zm,
                                   uint64_t imm_idx);
 
 
@@ -10634,7 +10634,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_s8]_vg4x2(uint64_t slice, svint8x2_t zn, svint8_t zm,
+  void svmls_lane_za32[_s8]_vg4x2(uint32_t slice, svint8x2_t zn, svint8_t zm,
                                   uint64_t imm_idx);
 
 
@@ -10644,7 +10644,7 @@ Multi-vector multiply-subtract long long (widening)
   // _za64[_s16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   // _za64[_u16] (only if __ARM_FEATURE_SME_I16I64 != 0)
   __attribute__((arm_streaming, arm_shared_za))
-  void svmls_lane_za32[_s8]_vg4x4(uint64_t slice, svint8x4_t zn, svint8_t zm,
+  void svmls_lane_za32[_s8]_vg4x4(uint32_t slice, svint8x4_t zn, svint8_t zm,
                                   uint64_t imm_idx);
   ```
@@ -11285,7 +11285,7 @@ Move multi-vectors to/from ZA
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
   svint8x2_t svread_hor_za8[_s8]_vg2(uint64_t tile,
-                                     uint64_t slice);
+                                     uint32_t slice);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
@@ -11293,7 +11293,7 @@ Move multi-vectors to/from ZA
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
   svint8x4_t svread_hor_za8[_s8]_vg4(uint64_t tile,
-                                     uint64_t slice);
+                                     uint32_t slice);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
@@ -11301,7 +11301,7 @@ Move multi-vectors to/from ZA
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
   svint8x2_t svread_ver_za8[_s8]_vg2(uint64_t tile,
-                                     uint64_t slice);
+                                     uint32_t slice);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
@@ -11309,55 +11309,55 @@ Move multi-vectors to/from ZA
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
   svint8x4_t svread_ver_za8[_s8]_vg4(uint64_t tile,
-                                     uint64_t slice);
+                                     uint32_t slice);
 
 
   // Variants are also available for _za64_u64 and _za64_f64
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
-  svint64x2_t svread_za64_s64_vg1x2(uint64_t slice);
+  svint64x2_t svread_za64_s64_vg1x2(uint32_t slice);
 
 
   // Variants are also available for _za64_u64 and _za64_f64
   __attribute__((arm_streaming, arm_shared_za, arm_preserves_za))
-  svint64x4_t svread_za64_s64_vg1x4(uint64_t slice);
+  svint64x4_t svread_za64_s64_vg1x4(uint32_t slice);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
   // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za))
-  void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint64_t slice, svint8x2_t zn);
+  void svwrite_hor_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
   // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za))
-  void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint64_t slice, svint8x4_t zn);
+  void svwrite_hor_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
   // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za))
-  void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint64_t slice, svint8x2_t zn);
+  void svwrite_ver_za8[_s8]_vg2(uint64_t tile, uint32_t slice, svint8x2_t zn);
 
 
   // Variants are also available for _za8[_u8], _za16[_s16], _za16[_u16],
   // _za16[_f16], _za16[_bf16], _za32[_s32], _za32[_u32], _za32[_f32],
   // _za64[_s64], _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za))
-  void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint64_t slice, svint8x4_t zn);
+  void svwrite_ver_za8[_s8]_vg4(uint64_t tile, uint32_t slice, svint8x4_t zn);
 
 
   // Variants are also available for _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za))
-  void svwrite_za64[_s64]_vg1x2(uint64_t slice, svint64x2_t zn);
+  void svwrite_za64[_s64]_vg1x2(uint32_t slice, svint64x2_t zn);
 
 
   // Variants are also available for _za64[_u64] and _za64[_f64]
   __attribute__((arm_streaming, arm_shared_za))
-  void svwrite_za64[_s64]_vg1x4(uint64_t slice, svint64x4_t zn);
+  void svwrite_za64[_s64]_vg1x4(uint32_t slice, svint64x4_t zn);
   ```
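To see the new `uint32_t slice` operand in context, here is a non-normative round-trip sketch using the read/write pairs above; the full spellings are assumed from the bracket convention, and note that the `tile` operand remains `uint64_t`:

``` c
// Hypothetical round-trip: read two horizontal slices of ZA8 tile 0 into a
// multi-vector, then write the same data back to the same slices.
__attribute__((arm_streaming, arm_shared_za))
void roundtrip(uint32_t slice) {
  svint8x2_t v = svread_hor_za8_s8_vg2(0, slice);
  svwrite_hor_za8_s8_vg2(0, slice, v);
}
```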