diff --git a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex index ae6ade9817c..69e8acf3dab 100644 --- a/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex +++ b/clients/drcachesim/tests/allasm-scattergather-basic-counts-aarch64.templatex @@ -3,34 +3,34 @@ Hello, world! Basic counts tool results: Total counts: #ifdef __ARM_FEATURE_SVE2 - 772 total \(fetched\) instructions - 286 total unique \(fetched\) instructions + 997 total \(fetched\) instructions + 361 total unique \(fetched\) instructions #else - 733 total \(fetched\) instructions - 271 total unique \(fetched\) instructions + 958 total \(fetched\) instructions + 346 total unique \(fetched\) instructions #endif 0 total non-fetched instructions 0 total prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1248 total data loads + 1547 total data loads 873 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2234 total data loads + 2757 total data loads 1615 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4206 total data loads + 5177 total data loads 3099 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1227 total data loads + 1526 total data loads 861 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2199 total data loads + 2722 total data loads 1595 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4143 total data loads + 5114 total data loads 3063 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ @@ -41,34 +41,34 @@ Total counts: .* Thread .* counts: #ifdef __ARM_FEATURE_SVE2 - 772 \(fetched\) instructions - 286 unique \(fetched\) instructions + 997 \(fetched\) instructions + 361 unique \(fetched\) instructions #else - 733 \(fetched\) instructions - 271 unique \(fetched\) instructions + 958 \(fetched\) instructions + 346 unique \(fetched\) instructions #endif 0 non-fetched instructions 0 prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1248 data loads + 1547 data loads 873 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2234 data loads + 2757 data loads 1615 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4206 data loads + 5177 data loads 3099 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1227 data loads + 1526 data loads 861 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2199 data loads + 2722 data loads 1595 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4143 data loads + 5114 data loads 3063 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ diff --git a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm index 08462f8799c..21d47223cbf 100644 --- a/clients/drcachesim/tests/allasm_scattergather_aarch64.asm +++ b/clients/drcachesim/tests/allasm_scattergather_aarch64.asm @@ -87,6 +87,13 @@ test_scalar_plus_vector: ld1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 ld1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 // Total: 14 + // ldff1b scalar+vector + ldff1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw] // 4 + ldff1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw] // 4 + ldff1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + 
ldff1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 14 // ld1sb scalar+vector ld1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw] // 4 ld1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw] // 4 @@ -94,6 +101,13 @@ test_scalar_plus_vector: ld1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 ld1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 // Total: 14 + // ldff1sb scalar+vector + ldff1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw] // 4 + ldff1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw] // 4 + ldff1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + ldff1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 14 // ld1h scalar+vector ld1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw #1] // 4 ld1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw #1] // 4 @@ -106,6 +120,18 @@ test_scalar_plus_vector: ld1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #1] // 2 ld1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 // Total: 28 + // ldff1h scalar+vector + ldff1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw #1] // 4 + ldff1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw #1] // 4 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #1] // 2 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #1] // 2 + ldff1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw] // 4 + ldff1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw] // 4 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #1] // 2 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 28 // ld1sh scalar+vector ld1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw #1] // 4 ld1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw #1] // 4 @@ -118,6 +144,18 @@ test_scalar_plus_vector: ld1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #1] // 2 ld1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 // Total: 28 + // ldff1sh scalar+vector + ldff1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw #1] // 4 + ldff1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw #1] // 4 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #1] // 2 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #1] // 2 + ldff1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw] // 4 + ldff1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw] // 4 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #1] // 2 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 28 // ld1w scalar+vector ld1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw #2] // 4 ld1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw #2] // 4 @@ -130,6 +168,18 @@ test_scalar_plus_vector: ld1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #2] // 2 ld1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, 
D_INDEX_REG.d] // 2 // Total: 28 + // ldff1w scalar+vector + ldff1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw #2] // 4 + ldff1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw #2] // 4 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #2] // 2 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #2] // 2 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + ldff1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, uxtw] // 4 + ldff1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, S_INDEX_REG.s, sxtw] // 4 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #2] // 2 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 28 // ld1sw scalar+vector ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #2] // 2 ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #2] // 2 @@ -138,6 +188,14 @@ test_scalar_plus_vector: ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #2] // 2 ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 // Total: 12 + // ldff1sw scalar+vector + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #2] // 2 + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #2] // 2 + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #2] // 2 + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 12 // ld1d scalar+vector ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #3] // 2 ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #3] // 2 @@ -146,7 +204,17 @@ test_scalar_plus_vector: ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #3] // 2 ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 // Total: 12 - // Total loads: 14 + 14 + 28 + 28 + 28 + 12 + 12 = 136 + // ldff1d scalar+vector + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw #3] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw #3] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, sxtw] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d, lsl #3] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, D_INDEX_REG.d] // 2 + // Total: 12 + // Total loads: 14 + 14 + 14 + 14 + 28 + 28 + 28 + 28 + 28 + 28 + // + 12 + 12 + 12 + 12 + // = 272 // st1b scalar+vector st1b SRC_REG1.d, D_MASK_REG, [BUFFER_REG, D_INDEX_REG.d, uxtw] // 2 @@ -195,14 +263,21 @@ test_scalar_plus_vector: test_vector_plus_immediate: ld1b DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1b DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 ld1sb DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1sb DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 ld1h DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1h DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 ld1sh DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 ld1w DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1w DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 ld1sw DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1sw DEST_REG1.d, 
D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 ld1d DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [Z_BASE_REG.d, #0] // 2 - // Total loads: 14 + // Total loads: 28 st1b SRC_REG1.d, D_MASK_REG, [Z_BASE_REG.d, #0] // 2 st1h SRC_REG1.d, D_MASK_REG, [Z_BASE_REG.d, #0] // 2 @@ -218,23 +293,39 @@ test_scalar_plus_scalar: ld1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 8 ld1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 4 ld1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 2 + ldff1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 16 + ldff1b DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 8 + ldff1b DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 4 + ldff1b DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 2 ldnt1b DEST_REG1.b, B_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 16 ld1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 8 ld1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 4 ld1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 2 + ldff1sb DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 8 + ldff1sb DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 4 + ldff1sb DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 2 ld1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 8 ld1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 4 ld1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 2 + ldff1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 8 + ldff1h DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 4 + ldff1h DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 2 ldnt1h DEST_REG1.h, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 8 ld1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 4 ld1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 2 + ldff1sh DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 4 + ldff1sh DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 2 ld1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 4 ld1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 2 + ldff1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 4 + ldff1w DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 2 ldnt1w DEST_REG1.s, S_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 4 ld1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 2 + ldff1sw DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #2] // 2 ld1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #3] // 2 + ldff1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #3] // 2 ldnt1d DEST_REG1.d, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #3] // 2 - // Total: 104 + // Total: 178 ld2b {DEST_REG1.b, DEST_REG2.b}, B_MASK_REG/z, [BUFFER_REG, X_INDEX_REG] // 32 ld2h {DEST_REG1.h, DEST_REG2.h}, H_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #1] // 16 @@ -254,7 +345,7 @@ test_scalar_plus_scalar: ld4d {DEST_REG1.d, DEST_REG2.d, DEST_REG3.d, DEST_REG4.d}, D_MASK_REG/z, [BUFFER_REG, X_INDEX_REG, lsl #3] // 8 // Total: 120 - // Total loads: 104 + 60 + 90 + 120 = 374 + // Total loads: 178 + 60 + 90 + 120 = 448 st1b DEST_REG1.b, B_MASK_REG, [BUFFER_REG, X_INDEX_REG] // 16 st1b DEST_REG1.h, H_MASK_REG, [BUFFER_REG, X_INDEX_REG] // 8 @@ -448,13 +539,13 @@ _start: // The expanded code should have one load/store per element per register. 
// The total number of loads/stores depends on the current vector length. - bl test_scalar_plus_vector // +(136 * vl_bytes/16) loads + bl test_scalar_plus_vector // +(272 * vl_bytes/16) loads // +(82 * vl_bytes/16) stores - bl test_vector_plus_immediate // +(14 * vl_bytes/16) loads + bl test_vector_plus_immediate // +(28 * vl_bytes/16) loads // +(8 * vl_bytes/16) stores - bl test_scalar_plus_scalar // +(374 * vl_bytes/16) loads + bl test_scalar_plus_scalar // +(448 * vl_bytes/16) loads // +(322 * vl_bytes/16) stores bl test_scalar_plus_immediate // +(448 * vl_bytes/16) loads @@ -467,11 +558,11 @@ _start: #endif // Running total: // SVE only: - // Loads: (136 + 14 + 374 + 448) * vl_bytes/16 + 60 = 972 * vl_bytes/16 + 60 + // Loads: (272 + 28 + 448 + 448) * vl_bytes/16 + 60 = 1196 * vl_bytes/16 + 60 // Stores: (82 + 8 + 322 + 322) * vl_bytes/16 = 734 * vl_bytes/16 // Including SVE2: - // Loads: ((972 + 14) * vl_bytes/16) + 60 = (986 * vl_bytes/16) + 60 + // Loads: ((1196 + 14) * vl_bytes/16) + 60 = (1210 * vl_bytes/16) + 60 // Stores: (734 + 8) * vl_bytes/16 = 742 * vl_bytes/16 /* Run all the instructions with no active elements */ @@ -492,11 +583,11 @@ _start: // Running total (unchanged from above): // SVE only: - // Loads: (972 * vl_bytes/16) + 60 + // Loads: (1196 * vl_bytes/16) + 60 // Stores: 734 * vl_bytes/16 // Including SVE2: - // Loads: (986 * vl_bytes/16) + 60 + // Loads: (1210 * vl_bytes/16) + 60 // Stores: 742 * vl_bytes/16 /* Run all instructions with one active element */ @@ -505,9 +596,9 @@ _start: ptrue S_MASK_REG.s, VL1 ptrue D_MASK_REG.d, VL1 - bl test_scalar_plus_vector // +52 loads, +31 stores - bl test_vector_plus_immediate // +7 loads, +4 stores - bl test_scalar_plus_scalar // +56 loads, +46 stores + bl test_scalar_plus_vector // +104 loads, +31 stores + bl test_vector_plus_immediate // +14 loads, +4 stores + bl test_scalar_plus_scalar // +72 loads, +46 stores bl test_scalar_plus_immediate // +72 loads, +46 stores bl test_replicating_loads // +8 loads, +0 stores #ifdef __ARM_FEATURE_SVE2 @@ -516,39 +607,39 @@ _start: // Running total: // SVE only: - // Loads: (972 * vl_bytes/16) + 60 + 52 + 7 + 56 + 72 + 8 = (972 * vl_bytes/16) + 255 + // Loads: (1196 * vl_bytes/16) + 60 + 104 + 14 + 72 + 72 + 8 = (1196 * vl_bytes/16) + 330 // Stores: (734 * vl_bytes/16) + 41 + 4 + 46 + 46 = (734 * vl_bytes/16) + 127 // Including SVE2: - // Loads: (986 * vl_bytes/16) + 255 + 7 = (986 * vl_bytes/16) + 262 + // Loads: (1210 * vl_bytes/16) + 330 + 7 = (1210 * vl_bytes/16) + 337 // Stores: (742 * vl_bytes/16) + 127 + 4 = (742 * vl_bytes/16) + 131 // The functions in this file have the following instructions counts: // _start 40 (+3 SVE2) - // test_scalar_plus_vector 84 - // test_vector_plus_immediate 12 - // test_scalar_plus_scalar 55 + // test_scalar_plus_vector 136 + // test_vector_plus_immediate 19 + // test_scalar_plus_scalar 71 // test_scalar_plus_immediate 71 // test_replicating_loads 9 // test_vector_plus_scalar 12 - // So there are 40 + 84 + 12 + 55 + 71 + 9 = 271 unique instructions - // (or 271 + 12 + 3 = 286 including SVE2) + // So there are 40 + 136 + 19 + 71 + 71 + 9 = 346 unique instructions + // (or 346 + 12 + 3 = 361 including SVE2) // We run the test_* functions 3 times each so the total instruction executed is - // ((84 + 12 + 55 + 71 + 9) * 3) + 40 = (231 * 3) + 37 = 733 - // (or 733 + 3 + (12 * 3) = 772 including SVE2) + // ((136 + 19 + 71 + 71 + 9) * 3) + 40 = (306 * 3) + 40 = 958 + // (or 958 + 3 + (12 * 3) = 997 including SVE2) // Totals: // SVE only: - // Loads: 
(972 * vl_bytes/16) + 255 + // Loads: (1196 * vl_bytes/16) + 330 // Stores: (734 * vl_bytes/16) + 127 - // Instructions: 733 - // Unique instructions: 271 + // Instructions: 958 + // Unique instructions: 346 // Including SVE2: - // Loads: (986 * vl_bytes/16) + 262 + // Loads: (1210 * vl_bytes/16) + 337 // Stores: (742 * vl_bytes/16) + 131 - // Instructions: 772 - // Unique instructions: 286 + // Instructions: 997 + // Unique instructions: 361 + // (With vl_bytes = 16 these totals are 1526 loads/861 stores SVE-only and + // 1547 loads/873 stores including SVE2, matching the templatex files.) // Exit. mov w0, #1 // stdout diff --git a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex index 180aedd55d5..df721751648 100644 --- a/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex +++ b/clients/drcachesim/tests/offline-allasm-scattergather-basic-counts-aarch64.templatex @@ -2,34 +2,34 @@ Hello, world! Basic counts tool results: Total counts: #ifdef __ARM_FEATURE_SVE2 - 772 total \(fetched\) instructions - 286 total unique \(fetched\) instructions + 997 total \(fetched\) instructions + 361 total unique \(fetched\) instructions #else - 733 total \(fetched\) instructions - 271 total unique \(fetched\) instructions + 958 total \(fetched\) instructions + 346 total unique \(fetched\) instructions #endif 0 total non-fetched instructions 0 total prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1248 total data loads + 1547 total data loads 873 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2234 total data loads + 2757 total data loads 1615 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4206 total data loads + 5177 total data loads 3099 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1227 total data loads + 1526 total data loads 861 total data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2199 total data loads + 2722 total data loads 1595 total data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4143 total data loads + 5114 total data loads 3063 total data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ @@ -40,35 +40,35 @@ Total counts: .* Thread .* counts: #ifdef __ARM_FEATURE_SVE2 - 772 \(fetched\) instructions - 286 unique \(fetched\) instructions + 997 \(fetched\) instructions + 361 unique \(fetched\) instructions #else - 733 \(fetched\) instructions - 271 unique \(fetched\) instructions + 958 \(fetched\) instructions + 346 unique \(fetched\) instructions #endif 0 non-fetched instructions 0 prefetches #ifdef __ARM_FEATURE_SVE2 #if (__ARM_FEATURE_SVE_BITS == 128) - 1248 data loads + 1547 data loads 873 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2223 data loads + 2757 data loads 1615 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4206 data loads + 5177 data loads 3099 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #else #if (__ARM_FEATURE_SVE_BITS == 128) - 1227 data loads + 1526 data loads 861 data stores #elif (__ARM_FEATURE_SVE_BITS == 256) - 2199 data loads + 2722 data loads 1595 data stores #elif (__ARM_FEATURE_SVE_BITS == 512) - 4143 data loads + 5114 data loads 3063 data stores #endif /* __ARM_FEATURE_SVE_BITS */ #endif /* __ARM_FEATURE_SVE2 */ diff --git a/clients/drcachesim/tests/scattergather-aarch64.templatex b/clients/drcachesim/tests/scattergather-aarch64.templatex index 021609331f0..9ddf4c58138 100644 --- a/clients/drcachesim/tests/scattergather-aarch64.templatex +++ b/clients/drcachesim/tests/scattergather-aarch64.templatex @@ -58,6 +58,65 @@ 
ld1d scalar\+vector 32bit unpacked unscaled offset sxtw: PASS ld1d scalar\+vector 64bit scaled offset: PASS ld1d scalar\+vector 64bit unscaled offset: PASS ld1d scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1b scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1b scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1b scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1b scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1b scalar\+vector 64bit unscaled offset: PASS +ldff1b scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1sb scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1sb scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1sb scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1sb scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1sb scalar\+vector 64bit unscaled offset: PASS +ldff1sb scalar\+vector 64bit unscaled offset: PASS +ldff1h scalar\+vector 32bit scaled offset uxtw: PASS +ldff1h scalar\+vector 32bit scaled offset sxtw: PASS +ldff1h scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1h scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1h scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1h scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1h scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1h scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1h scalar\+vector 64bit scaled offset: PASS +ldff1h scalar\+vector 64bit unscaled offset: PASS +ldff1h scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1sh scalar\+vector 32bit scaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit scaled offset sxtw: PASS +ldff1sh scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1sh scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1sh scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1sh scalar\+vector 64bit scaled offset: PASS +ldff1sh scalar\+vector 64bit unscaled offset: PASS +ldff1sh scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1w scalar\+vector 32bit scaled offset uxtw: PASS +ldff1w scalar\+vector 32bit scaled offset sxtw: PASS +ldff1w scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1w scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1w scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1w scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1w scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1w scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1w scalar\+vector 64bit scaled offset: PASS +ldff1w scalar\+vector 64bit unscaled offset: PASS +ldff1w scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1sw scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1sw scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1sw scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1sw scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1sw scalar\+vector 64bit scaled offset: PASS +ldff1sw scalar\+vector 64bit unscaled offset: PASS +ldff1sw scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1d scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1d scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1d scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1d scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1d scalar\+vector 64bit scaled offset: PASS 
+ldff1d scalar\+vector 64bit unscaled offset: PASS +ldff1d scalar\+vector 64bit unscaled offset Zt==Zm: PASS st1b scalar\+vector 32bit unpacked unscaled offset uxtw: PASS st1b scalar\+vector 32bit unpacked unscaled offset sxtw: PASS st1b scalar\+vector 32bit unscaled offset uxtw: PASS @@ -112,6 +171,21 @@ ld1sw vector\+immediate 64bit element \(max index\): PASS ld1d vector\+immediate 64bit element: PASS ld1d vector\+immediate 64bit element \(max index\): PASS ld1d vector\+immediate 64bit element Zt==Zn: PASS +ldff1b vector\+immediate 64bit element: PASS +ldff1b vector\+immediate 64bit element \(max index\): PASS +ldff1sb vector\+immediate 64bit element: PASS +ldff1sb vector\+immediate 64bit element \(max index\): PASS +ldff1h vector\+immediate 64bit element: PASS +ldff1h vector\+immediate 64bit element \(max index\): PASS +ldff1sh vector\+immediate 64bit element: PASS +ldff1sh vector\+immediate 64bit element \(max index\): PASS +ldff1w vector\+immediate 64bit element: PASS +ldff1w vector\+immediate 64bit element \(max index\): PASS +ldff1sw vector\+immediate 64bit element: PASS +ldff1sw vector\+immediate 64bit element \(max index\): PASS +ldff1d vector\+immediate 64bit element: PASS +ldff1d vector\+immediate 64bit element \(max index\): PASS +ldff1d vector\+immediate 64bit element Zt==Zn: PASS st1b vector\+immediate 64bit element: PASS st1b vector\+immediate 64bit element \(max index\): PASS st1b vector\+immediate 64bit element \(repeated base\): PASS @@ -148,6 +222,22 @@ ld1rqb scalar\+scalar: PASS ld1rqh scalar\+scalar: PASS ld1rqw scalar\+scalar: PASS ld1rqd scalar\+scalar: PASS +ldff1b scalar\+scalar 8bit element: PASS +ldff1b scalar\+scalar 16bit element: PASS +ldff1b scalar\+scalar 32bit element: PASS +ldff1b scalar\+scalar 64bit element: PASS +ldff1sb scalar\+scalar 16bit element: PASS +ldff1sb scalar\+scalar 32bit element: PASS +ldff1sb scalar\+scalar 64bit element: PASS +ldff1h scalar\+scalar 16bit element: PASS +ldff1h scalar\+scalar 32bit element: PASS +ldff1h scalar\+scalar 64bit element: PASS +ldff1sh scalar\+scalar 32bit element: PASS +ldff1sh scalar\+scalar 64bit element: PASS +ldff1w scalar\+scalar 32bit element: PASS +ldff1w scalar\+scalar 64bit element: PASS +ldff1sw scalar\+scalar: PASS +ldff1d scalar\+scalar: PASS ld2b scalar\+scalar: PASS ld2h scalar\+scalar: PASS ld2w scalar\+scalar: PASS diff --git a/ext/drx/drx.c b/ext/drx/drx.c index c5a97dd26b5..f5b9d25dd1b 100644 --- a/ext/drx/drx.c +++ b/ext/drx/drx.c @@ -72,7 +72,6 @@ #endif #if defined(X86) || defined(AARCH64) -/* TODO i#5036: Complete AArch64 support. */ # define PLATFORM_SUPPORTS_SCATTER_GATHER #endif diff --git a/ext/drx/scatter_gather_aarch64.c b/ext/drx/scatter_gather_aarch64.c index e3f1960b750..ee3e250015e 100644 --- a/ext/drx/scatter_gather_aarch64.c +++ b/ext/drx/scatter_gather_aarch64.c @@ -1423,16 +1423,6 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e bool res = false; get_scatter_gather_info(sg_instr, &sg_info); - /* Filter out instructions which are not yet supported. - * We return true with *expanded=false here to indicate that no error occurred but - * we didn't expand any instructions. This matches the behaviour of this function - * for architectures with no scatter/gather expansion support. - */ - if (sg_info.faulting_behavior == DRX_FIRST_FAULTING) { - /* TODO i#5036: Add support for first-fault loads. 
*/ - return true; - } - const bool is_contiguous = !(reg_is_z(sg_info.base_reg) || reg_is_z(sg_info.index_reg)); @@ -1623,34 +1613,108 @@ drx_expand_scatter_gather(void *drcontext, instrlist_t *bb, DR_PARAM_OUT bool *e return res; } +/* Used by the signal and state_restore event handlers to detect whether a first-fault + * load faulted on the first active element or not. + */ +static bool +loop_var_is_first_element(const scatter_gather_info_t *sg_info, + const scratch_regs_t *scratch_regs, const dr_mcontext_t *mc) +{ + const uint vl_bytes = proc_get_vector_length_bytes(); + const uint pl_bytes = vl_bytes / 8; + + /* DynamoRIO currently supports vector lengths up to 512 bits which means a + * predicate register size of up to 512/8 = 64 bits. + */ + DR_ASSERT(pl_bytes <= sizeof(uint64)); + + uint64 loop_var = 0; + memcpy(&loop_var, &mc->svep[scratch_regs->pred - DR_REG_P0].u64[0], pl_bytes); + + uint64 governing_predicate = 0; + memcpy(&governing_predicate, &mc->svep[sg_info->mask_reg - DR_REG_P0].u64[0], + pl_bytes); + + /* Make sure we only consider the governing_predicate bits used by the instruction. */ + switch (sg_info->element_size) { + case OPSZ_1: break; /* 0b11111111 */ + case OPSZ_2: governing_predicate &= 0x5555555555555555; break; /* 0b01010101 */ + case OPSZ_4: governing_predicate &= 0x1111111111111111; break; /* 0b00010001 */ + case OPSZ_8: governing_predicate &= 0x0101010101010101; break; /* 0b00000001 */ + default: DR_ASSERT_MSG(false, "Scatter/gather instruction has invalid element size"); + } + + /* loop_var should be a 1-bit mask indicating the current element. */ + DR_ASSERT(loop_var != 0); + DR_ASSERT(TEST(loop_var, governing_predicate)); + + /* If any of the governing_predicate bits lower than the loop_var bit are set, then + * this is not the first active element. + */ + return !TESTANY(loop_var - 1, governing_predicate); +} + dr_signal_action_t drx_scatter_gather_signal_event(void *drcontext, dr_siginfo_t *info, instr_t *sg_inst) { scatter_gather_info_t sg_info; get_scatter_gather_info(sg_inst, &sg_info); + /* allocate_zp_registers() is deterministic so we can call it again here and find out + * which registers are used in the expansion. + */ + spill_slot_state_t spill_slot_state; + init_spill_slot_state(&spill_slot_state); + + scratch_regs_t scratch_regs; + + allocate_zp_registers(sg_inst, &sg_info, &spill_slot_state, &scratch_regs); + if ((info->sig == SIGSEGV || info->sig == SIGBUS) && - sg_info.faulting_behavior == DRX_NON_FAULTING) { - /* The only SVE instructions which have non-faulting behaviour are - * predicated contiguous scalar+immediate loads (ldnf1[bhwd]). - * TODO i#5036: instr_compute_address() does not support vector addressing modes - * (scalar+vector, vector+immediate) which is fine for non-faulting - * loads, but when we add support for first-fault instructions we - * will need to switch this to use instr_compute_address_ex(). 
- */ - DR_ASSERT(!reg_is_z(sg_info.base_reg)); - DR_ASSERT(!reg_is_z(sg_info.index_reg)); - const app_pc load_min_addr = instr_compute_address(sg_inst, info->mcontext); - const app_pc load_max_addr = - load_min_addr + opnd_size_in_bytes(sg_info.scatter_gather_size); - if (info->access_address < load_min_addr || - info->access_address >= load_max_addr) { - /* The faulting address is out of range for the expanded ldnf instruction so - * the fault must have come from an instruction inserted by a client, rather - * than one of the expansion loads inserted by expand_scatter_gather() so we - * pass the fault on for the client to handle. + sg_info.faulting_behavior != DRX_NORMAL_FAULTING) { + if (reg_is_z(sg_info.base_reg) || reg_is_z(sg_info.index_reg)) { + DR_ASSERT(sg_info.faulting_behavior == DRX_FIRST_FAULTING); + app_pc addr; + uint address_index = 0; + bool found_match = false; + while (!found_match && + instr_compute_address_ex(sg_inst, info->mcontext, address_index, &addr, + NULL)) { + if (addr == info->access_address) + found_match = true; + else + address_index++; + } + if (!found_match || address_index == 0) { + /* For first-fault loads, the fault is not suppressed if the element that + * faults is the first active element. If the faulting address does not + * match any element's address, the fault must have come from an + * instruction inserted by a client, so we also deliver it. + */ + return DR_SIGNAL_DELIVER; + } + } else { + const app_pc load_min_addr = instr_compute_address(sg_inst, info->mcontext); + const app_pc load_max_addr = + load_min_addr + opnd_size_in_bytes(sg_info.scatter_gather_size); + if (info->access_address < load_min_addr || + info->access_address >= load_max_addr) { + /* The faulting address is out of range for the expanded ldnf instruction + * so the fault must have come from an instruction inserted by a client, + * rather than one of the expansion loads inserted by + * expand_scatter_gather() so we pass the fault on for the client to + * handle. + */ + return DR_SIGNAL_DELIVER; + } + + /* First-fault loads behave differently depending on which element faults. + * If the first active element faults then the fault is propagated like a + * normal gather instruction. If any other element faults the fault is + * suppressed and the instruction behaves like a non-faulting load. */ - return DR_SIGNAL_DELIVER; + if (sg_info.faulting_behavior == DRX_FIRST_FAULTING && + loop_var_is_first_element(&sg_info, &scratch_regs, info->raw_mcontext)) { + return DR_SIGNAL_DELIVER; + } } /* Non-faulting loads do not generate a fault when one of the addresses it * accesses faults. Instead it sets the value of the FFR to indicate which @@ -1661,16 +1725,6 @@ drx_scatter_gather_signal_event(void *drcontext, dr_siginfo_t *info, instr_t *sg /* Skip to the next app instruction */ info->mcontext->pc += instr_length(drcontext, sg_inst); - /* allocate_zp_registers() is deterministic so we can call it again here and find - * out which registers are used in the expansion. - */ - spill_slot_state_t spill_slot_state; - init_spill_slot_state(&spill_slot_state); - - scratch_regs_t scratch_regs; - - allocate_zp_registers(sg_inst, &sg_info, &spill_slot_state, &scratch_regs); - /* Set the FFR value * * The FFR is like a special purpose predicate register. When an element access @@ -1781,16 +1835,23 @@ drx_scatter_gather_restore_state(void *drcontext, dr_restore_state_info_t *info, per_thread_t *pt = (per_thread_t *)drmgr_get_tls_field(drcontext, drx_scatter_gather_tls_idx); + /* If the faulting element is not the first active element we need to + * suppress the fault and leave the partial value in the dst reg. 
+ */ + const bool dont_restore_dst = sg_info.faulting_behavior == DRX_FIRST_FAULTING && + !loop_var_is_first_element(&sg_info, &scratch_regs, info->raw_mcontext); + const uint vl_bytes = proc_get_vector_length_bytes(); const uint pl_bytes = vl_bytes / 8; for (uint slot = 0; slot < NUM_VECTOR_SLOTS; slot++) { if (spill_slot_state.vector_slots[slot].kind != SLOT_KIND_UNUSED) { - DR_ASSERT(spill_slot_state.vector_slots[slot].reg >= DR_REG_Z0 && - spill_slot_state.vector_slots[slot].reg <= DR_REG_Z31); + const reg_id_t reg = spill_slot_state.vector_slots[slot].reg; + DR_ASSERT(reg >= DR_REG_Z0 && reg <= DR_REG_Z31); - const size_t reg_num = spill_slot_state.vector_slots[slot].reg - DR_REG_Z0; + if (dont_restore_dst && reg == sg_info.gather_dst_reg) + continue; - memcpy(&info->mcontext->simd[reg_num], + memcpy(&info->mcontext->simd[reg - DR_REG_Z0], &((char *)pt->scratch_vector_spill_slots_aligned)[vl_bytes * slot], vl_bytes); } diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.cpp b/suite/tests/client-interface/drx-scattergather-aarch64.cpp index 828c70d5fe2..b8423495492 100644 --- a/suite/tests/client-interface/drx-scattergather-aarch64.cpp +++ b/suite/tests/client-interface/drx-scattergather-aarch64.cpp @@ -45,6 +45,8 @@ #include "tools.h" +#define DUMP_UCONTEXT 0 + namespace { /* @@ -562,6 +564,10 @@ static bool signal_handler_called; void signal_handler(int sig, siginfo_t *siginfo, ucontext_t *ucxt) { +#if DUMP_UCONTEXT + dump_ucontext(ucxt, /*is_sve=*/true, get_vl_bytes()); +#endif + signal_handler_called = true; // Skip the faulting instruction ucxt->uc_mcontext.pc += 4; @@ -621,7 +627,8 @@ template struct test_case_base_t { } virtual void - check_fault(bool expected_fault, bool signal_handler_called) + check_fault(predicate_reg_value128_t pred, bool expected_fault, + size_t faulting_element, bool signal_handler_called) { if (!expected_fault && signal_handler_called) { test_failed(); @@ -682,7 +689,7 @@ template struct test_case_base_t { } } - check_fault(expected_fault, signal_handler_called); + check_fault(pred, expected_fault, faulting_element, signal_handler_called); // Validate the output if: // - This is not a fault test (check the expanded instruction behaved @@ -916,6 +923,23 @@ template struct test_case_base_t { } } } + + bool + first_active_element_faults(predicate_reg_value128_t pred, size_t &faulting_element) + { + const auto element_size_bytes = static_cast(element_size_); + const auto num_mask_elements = TEST_VL_BYTES / element_size_bytes; + + while ( + !element_is_active(faulting_element % num_mask_elements, pred, element_size_)) + faulting_element++; + + size_t first_active_element = 0; + while (!element_is_active(first_active_element, pred, element_size_)) + first_active_element++; + + return first_active_element == faulting_element; + } }; struct basic_test_ptrs_t { @@ -1697,149 +1721,138 @@ test_ld1_scalar_plus_vector() # undef TEST_FUNC } -struct scalar_plus_vector_store_test_case_t : public scalar_plus_vector_test_case_base_t { - vector_reg_value128_t offset_data_; - - struct registers_used_t { - unsigned src_z; - unsigned governing_p; - unsigned index_z; - }; - registers_used_t registers_used_; - - element_size_t stored_value_size_; - - bool scaled_; - - expected_values_t expected_values_; +struct scalar_plus_vector_first_fault_load_test_case_t + : public scalar_plus_vector_load_test_case_t { - template - scalar_plus_vector_store_test_case_t( + template + scalar_plus_vector_first_fault_load_test_case_t( std::string name, test_func_t func, 
registers_used_t registers_used, + std::array reference_data, std::array offsets, - element_size_t stored_value_size, bool scaled) - : scalar_plus_vector_test_case_base_t( - std::move(name), std::move(func), registers_used.governing_p, - static_cast(sizeof(OFFSET_T)), - /*base_ptr=*/OUTPUT_DATA.base_addr()) - , registers_used_(registers_used) - , stored_value_size_(stored_value_size) - , scaled_(scaled) - , expected_values_(offsets, stored_value_size) + element_size_t data_size) + : scalar_plus_vector_load_test_case_t(std::move(name), std::move(func), + registers_used, reference_data, offsets, + data_size) + { - std::memcpy(offset_data_.data(), offsets.data(), offset_data_.size()); } - virtual test_ptrs_t - setup(test_register_data_t ®ister_data, bool force_fault, - size_t faulting_element) override + void + check_fault(predicate_reg_value128_t pred, bool expected_fault, + size_t faulting_element, bool signal_handler_called) override { - // Set the value for the offset register. - register_data.before.set_z_register_value(registers_used_.index_z, offset_data_); - - if (force_fault) { - const size_t offset = test_memory_t::CHUNK_SIZE; - switch (element_size_) { - case element_size_t::SINGLE: - register_data.before.set_z_register_element( - registers_used_.index_z, faulting_element, offset); - break; - case element_size_t::DOUBLE: - register_data.before.set_z_register_element( - registers_used_.index_z, faulting_element, offset); - break; - default: - assert(false && - "scalar+vector instruction should have single or double " - "element size"); - } - } - - register_data.before.set_z_register_value(registers_used_.src_z, - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, - 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, - 0x12, 0x13, 0x14, 0x15 }); - OUTPUT_DATA.reset(); + expected_fault = + expected_fault && first_active_element_faults(pred, faulting_element); - return { - base_ptr_, - register_data.before.z.data(), - register_data.before.p.data(), - register_data.after.z.data(), - register_data.after.p.data(), - }; + scalar_plus_vector_load_test_case_t::check_fault( + pred, expected_fault, faulting_element, signal_handler_called); } void check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, bool expected_fault, size_t faulting_element) override { + if (!expected_fault || first_active_element_faults(pred, faulting_element)) { + // If there is no faulting element, or the first active element faults, then + // this instruction behaves the same as a regular scalar+vector load. + scalar_plus_vector_load_test_case_t::check_output( + pred, register_data, expected_fault, faulting_element); + return; + } + + // Check the FFR value + const auto element_size_bytes = static_cast(element_size_); + const auto num_mask_elements = TEST_VL_BYTES / element_size_bytes; + + const auto original_ffr = register_data.before.get_ffr_value(); + predicate_reg_value128_t ffr_128 = 0; + memcpy(&ffr_128, original_ffr.data, sizeof(ffr_128)); + // All bits from the faulting element onwards are 0 so mask them out. 
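+ // For example, with 4-byte elements and a fault at element 2 the mask below is + // (1 << (2 * 4)) - 1 = 0x00ff, which keeps the FFR bits for elements 0 and 1 + // and clears the bits from the faulting element upwards.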
+ ffr_128 &= + (1 << ((faulting_element % num_mask_elements) * element_size_bytes)) - 1; + + std::vector expected_ffr_data(original_ffr.size, 0); + memcpy(expected_ffr_data.data(), original_ffr.data, + 2 * ((faulting_element * element_size_bytes) / 16)); + memcpy(&expected_ffr_data[2 * ((faulting_element * element_size_bytes) / 16)], + &ffr_128, sizeof(ffr_128)); + const scalable_reg_value_t expected_ffr { + expected_ffr_data.data(), + expected_ffr_data.size(), + }; + + const auto actual_ffr = register_data.after.get_ffr_value(); + + if (actual_ffr != expected_ffr) { + test_failed(); + print("predicate: "); + print_predicate( + register_data.before.get_p_register_value(registers_used_.governing_p)); + print("\noriginal ffr: "); + print_predicate(register_data.before.get_ffr_value()); + print("\nexpected ffr: "); + print_predicate(expected_ffr); + print("\nactual ffr: "); + print_predicate(actual_ffr); + print("\n"); + } + + // Check destination register value. + if (faulting_element > 0) { + const auto vl_bytes = get_vl_bytes(); + + std::vector expected_output_data; + expected_output_data.resize(vl_bytes); + + assert(reference_data_.size() == TEST_VL_BYTES); + for (size_t i = 0; i < vl_bytes / TEST_VL_BYTES; i++) { + memcpy(&expected_output_data[TEST_VL_BYTES * i], reference_data_.data(), + TEST_VL_BYTES); + } + apply_predicate_mask(expected_output_data, pred, element_size_); + const scalable_reg_value_t expected_output { + expected_output_data.data(), + vl_bytes, + }; + + const auto output_value = + register_data.after.get_z_register_value(registers_used_.dest_z); + + // Only the bytes before the faulting element hold defined values. + if (memcmp(expected_output.data, output_value.data, + faulting_element * element_size_bytes) != 0) { + test_failed(); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used_.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); + } + } + + // Check that the values of the other Z registers have been preserved. for (size_t i = 0; i < NUM_Z_REGS; i++) { - check_z_reg(i, register_data); + if (i != registers_used_.dest_z) + check_z_reg(i, register_data); } // Check that the values of the P registers have been preserved. 
for (size_t i = 0; i < NUM_P_REGS; i++) { check_p_reg(i, register_data); } - check_ffr(register_data); - - if (!expected_fault) { - switch (element_size_) { - case element_size_t::SINGLE: { - std::array base_ptrs { base_ptr_, base_ptr_, base_ptr_, - base_ptr_ }; - switch (stored_value_size_) { - case element_size_t::BYTE: - check_expected_values(expected_values_.u8x4, pred, base_ptrs, - scaled_); - break; - case element_size_t::HALF: - check_expected_values(expected_values_.u16x4, pred, base_ptrs, - scaled_); - break; - case element_size_t::SINGLE: - check_expected_values(expected_values_.u32x4, pred, base_ptrs, - scaled_); - break; - } - } - case element_size_t::DOUBLE: { - std::array base_ptrs { base_ptr_, base_ptr_ }; - switch (stored_value_size_) { - case element_size_t::BYTE: - check_expected_values(expected_values_.u8x2, pred, base_ptrs, - scaled_); - break; - case element_size_t::HALF: - check_expected_values(expected_values_.u16x2, pred, base_ptrs, - scaled_); - break; - case element_size_t::SINGLE: - check_expected_values(expected_values_.u32x2, pred, base_ptrs, - scaled_); - break; - case element_size_t::DOUBLE: - check_expected_values(expected_values_.u64x2, pred, base_ptrs, - scaled_); - break; - } - } - } - } } }; test_result_t -test_st1_scalar_plus_vector() +test_ldff1_scalar_plus_vector() { -# define TEST_FUNC(st_instruction) \ - [](scalar_plus_vector_store_test_case_t::test_ptrs_t &ptrs) { \ +# define TEST_FUNC(ld_instruction) \ + [](scalar_plus_vector_first_fault_load_test_case_t::test_ptrs_t &ptrs) { \ asm(/* clang-format off */ \ RESTORE_FFR(p_restore_base) \ RESTORE_Z_REGISTERS(z_restore_base) \ RESTORE_P_REGISTERS(p_restore_base) \ - st_instruction "\n" \ + ld_instruction "\n" \ SAVE_Z_REGISTERS(z_save_base) \ SAVE_P_REGISTERS(p_save_base) \ SAVE_FFR(p_save_base) /* clang-format on */ \ @@ -1851,379 +1864,556 @@ test_st1_scalar_plus_vector() : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ } - return run_tests({ - /* { - * Test name, - * Function that executes the test instruction, - * Registers used {zt, pg, zm}, - * Offset data (value for zm), - * Stored value size, - * Is the index scaled, - * }, - */ - // ST1B instructions. + return run_tests({ + // LDFF1B instructions. 
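+ /* { + * Test name, + * Function that executes the test instruction, + * Registers used {zt, pg, zm}, + * Expected output data, + * Offset data (value for zm), + * Data size, + * }, + */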
{ - "st1b scalar+vector 32bit unpacked unscaled offset uxtw", - TEST_FUNC("st1b z0.d, p0, [%[base], z31.d, uxtw]"), - { /*zt=*/0, /*pg=*/0, /*zm=*/31 }, - std::array { 0, 100 }, + "ldff1b scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("ldff1b z0.s, p0/z, [%[base], z5.s, uxtw]"), + { /*zt=*/0, /*pg=*/0, /*zm=*/5 }, + std::array { 0x14, 0xf2, 0x07, 0x23 }, + std::array { 14, 30, 7, 23 }, element_size_t::BYTE, - /*scaled=*/false, }, { - "st1b scalar+vector 32bit unpacked unscaled offset sxtw", - TEST_FUNC("st1b z1.d, p1, [%[base], z30.d, sxtw]"), - { /*zt=*/1, /*pg=*/1, /*zm=*/30 }, - std::array { -1, 101 }, + "ldff1b scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("ldff1b z1.s, p1/z, [%[base], z6.s, sxtw]"), + { /*zt=*/1, /*pg=*/1, /*zm=*/6 }, + std::array { 0xf7, 0xf2, 0x19, 0xf2 }, + std::array { -7, 30, 19, 30 }, element_size_t::BYTE, - /*scaled=*/false, }, { - "st1b scalar+vector 32bit unscaled offset uxtw", - TEST_FUNC("st1b z2.s, p2, [%[base], z29.s, uxtw]"), - { /*zt=*/2, /*pg=*/2, /*zm=*/29 }, - std::array { 2, 102, 3, 103 }, + "ldff1b scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1b z2.d, p2/z, [%[base], z7.d, uxtw]"), + { /*zt=*/2, /*pg=*/2, /*zm=*/7 }, + std::array { 0x13, 0x14 }, + std::array { 13, 14 }, element_size_t::BYTE, - /*scaled=*/false, }, { - "st1b scalar+vector 32bit unscaled offset sxtw", - TEST_FUNC("st1b z3.s, p3, [%[base], z28.s, sxtw]"), - { /*zt=*/3, /*pg=*/3, /*zm=*/28 }, - std::array { -3, -103, 4, 104 }, + "ldff1b scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ldff1b z3.d, p3/z, [%[base], z8.d, sxtw]"), + { /*zt=*/3, /*pg=*/3, /*zm=*/8 }, + std::array { 0xf6, 0x23 }, + std::array { -6, 23 }, element_size_t::BYTE, - /*scaled=*/false, }, { - "st1b scalar+vector 32bit unscaled offset sxtw (repeated offset)", - TEST_FUNC("st1b z3.s, p3, [%[base], z28.s, sxtw]"), - { /*zt=*/3, /*pg=*/3, /*zm=*/28 }, - std::array { -4, -4, 5, 5 }, + "ldff1b scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1b z4.d, p4/z, [%[base], z9.d]"), + { /*zt=*/4, /*pg=*/4, /*zm=*/9 }, + std::array { 0x15, 0x16 }, + std::array { 15, 16 }, element_size_t::BYTE, - /*scaled=*/false, }, { - "st1b scalar+vector 64bit unscaled offset", - TEST_FUNC("st1b z4.d, p4, [%[base], z27.d]"), - { /*zt=*/4, /*pg=*/4, /*zm=*/27 }, - std::array { 5, 104 }, + "ldff1b scalar+vector 64bit unscaled offset Zt==Zm", + TEST_FUNC("ldff1b z5.d, p5/z, [%[base], z5.d]"), + { /*zt=*/5, /*pg=*/5, /*zm=*/5 }, + std::array { 0x01, 0x18 }, + std::array { 1, 18 }, element_size_t::BYTE, - /*scaled=*/false, }, + // LDFF1SB instructions. { - "st1b scalar+vector 64bit unscaled offset (repeated offset)", - TEST_FUNC("st1b z4.d, p4, [%[base], z27.d]"), - { /*zt=*/4, /*pg=*/4, /*zm=*/27 }, - std::array { 6, 6 }, + "ldff1sb scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("ldff1sb z6.s, p6/z, [%[base], z10.s, uxtw]"), + { /*zt=*/6, /*pg=*/6, /*zm=*/10 }, + std::array { 0x17, -8, 0x05, 0x16 }, + std::array { 17, 24, 5, 16 }, element_size_t::BYTE, - /*scaled=*/false, }, - // ST1H instructions. 
{ - "st1h scalar+vector 32bit scaled offset uxtw", - TEST_FUNC("st1h z5.s, p5, [%[base], z26.s, uxtw #1]"), - { /*zt=*/5, /*pg=*/5, /*zm=*/26 }, - std::array { 7, 105, 9, 107 }, - element_size_t::HALF, - /*scaled=*/true, + "ldff1sb scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("ldff1sb z7.s, p7/z, [%[base], z11.s, sxtw]"), + { /*zt=*/7, /*pg=*/7, /*zm=*/11 }, + std::array { -14, -13, -12, 0x15 }, + std::array { -2, 29, 28, 15 }, + element_size_t::BYTE, }, { - "st1h scalar+vector 32bit scaled offset sxtw", - TEST_FUNC("st1h z6.s, p6, [%[base], z25.s, sxtw #1]"), - { /*zt=*/6, /*pg=*/6, /*zm=*/25 }, - std::array { -8, -106, 10, 108 }, - element_size_t::HALF, - /*scaled=*/true, + "ldff1sb scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1sb z8.d, p0/z, [%[base], z12.d, uxtw]"), + { /*zt=*/8, /*pg=*/0, /*zm=*/12 }, + std::array { 0x22, -14 }, + std::array { 22, 30 }, + element_size_t::BYTE, }, { - "st1h scalar+vector 32bit unpacked scaled offset uxtw", - TEST_FUNC("st1h z7.d, p7, [%[base], z24.d, uxtw #1]"), - { /*zt=*/7, /*pg=*/7, /*zm=*/24 }, - std::array { 9, 107 }, - element_size_t::HALF, - /*scaled=*/true, + "ldff1sb scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ldff1sb z9.d, p1/z, [%[base], z13.d, sxtw]"), + { /*zt=*/9, /*pg=*/1, /*zm=*/13 }, + std::array { -12, 0x06 }, + std::array { -4, 6 }, + element_size_t::BYTE, }, { - "st1h scalar+vector 32bit unpacked scaled offset sxtw", - TEST_FUNC("st1h z8.d, p0, [%[base], z23.d, sxtw #1]"), - { /*zt=*/8, /*pg=*/0, /*zm=*/23 }, - std::array { -10, 108 }, + "ldff1sb scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1sb z10.d, p2/z, [%[base], z14.d]"), + { /*zt=*/10, /*pg=*/2, /*zm=*/14 }, + std::array { 0x17, 0x04 }, + std::array { 17, 4 }, + element_size_t::BYTE, + }, + { + "ldff1sb scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1sb z11.d, p3/z, [%[base], z15.d]"), + { /*zt=*/11, /*pg=*/3, /*zm=*/15 }, + std::array { 0x15, -14 }, + std::array { 15, 30 }, + element_size_t::BYTE, + }, + // LDFF1H instructions. 
+ { + "ldff1h scalar+vector 32bit scaled offset uxtw", + TEST_FUNC("ldff1h z12.s, p4/z, [%[base], z16.s, uxtw #1]"), + { /*zt=*/12, /*pg=*/4, /*zm=*/16 }, + std::array { 0x0010, 0x0005, 0x0020, 0x0006 }, + std::array { 10, 5, 20, 6 }, element_size_t::HALF, - /*scaled=*/true, }, { - "st1h scalar+vector 32bit unpacked unscaled offset uxtw", - TEST_FUNC("st1h z9.d, p1, [%[base], z22.d, uxtw]"), - { /*zt=*/9, /*pg=*/1, /*zm=*/22 }, - std::array { 11, 109 }, + "ldff1h scalar+vector 32bit scaled offset sxtw", + TEST_FUNC("ldff1h z13.s, p5/z, [%[base], z17.s, sxtw #1]"), + { /*zt=*/13, /*pg=*/5, /*zm=*/17 }, + std::array { 0x0022, 0x0002, 0x0022, 0x0012 }, + std::array { -10, 2, 22, 12 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 32bit unpacked unscaled offset sxtw", - TEST_FUNC("st1h z10.d, p2, [%[base], z21.d, sxtw]"), - { /*zt=*/10, /*pg=*/2, /*zm=*/21 }, - std::array { -12, 110 }, + "ldff1h scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("ldff1h z14.d, p6/z, [%[base], z18.d, uxtw #1]"), + { /*zt=*/14, /*pg=*/6, /*zm=*/18 }, + std::array { 0x0011, 0x0013 }, + std::array { 11, 13 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 32bit unscaled offset uxtw", - TEST_FUNC("st1h z11.s, p3, [%[base], z20.s, uxtw]"), - { /*zt=*/11, /*pg=*/3, /*zm=*/20 }, - std::array { 13, 111, 15, 113 }, + "ldff1h scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("ldff1h z15.d, p7/z, [%[base], z19.d, sxtw #1]"), + { /*zt=*/15, /*pg=*/7, /*zm=*/19 }, + std::array { 0x0023, 0x0021 }, + std::array { -9, 21 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 32bit unscaled offset sxtw", - TEST_FUNC("st1h z12.s, p4, [%[base], z19.s, sxtw]"), - { /*zt=*/12, /*pg=*/4, /*zm=*/19 }, - std::array { -14, -112, 16, 114 }, + "ldff1h scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1h z16.d, p0/z, [%[base], z20.d, uxtw]"), + { /*zt=*/16, /*pg=*/0, /*zm=*/20 }, + std::array { 0x00f1, 0x2019 }, + std::array { 31, 19 }, + element_size_t::BYTE, + }, + { + "ldff1h scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ldff1h z17.d, p1/z, [%[base], z21.d, sxtw]"), + { /*zt=*/17, /*pg=*/1, /*zm=*/21 }, + std::array { 0xf5f6, 0xf4f5 }, + std::array { -6, 27 }, + element_size_t::BYTE, + }, + { + "ldff1h scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("ldff1h z18.s, p2/z, [%[base], z22.s, uxtw]"), + { /*zt=*/18, /*pg=*/2, /*zm=*/22 }, + std::array { 0x1716, 0xf4f5, 0x0605, 0x0504 }, + std::array { 16, 27, 5, 4 }, + element_size_t::BYTE, + }, + { + "ldff1h scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("ldff1h z19.s, p3/z, [%[base], z23.s, sxtw]"), + { /*zt=*/19, /*pg=*/3, /*zm=*/23 }, + std::array { 0x2322, 0x1009, 0x1110, 0x0403 }, + std::array { -10, 9, 10, 3 }, + element_size_t::BYTE, + }, + { + "ldff1h scalar+vector 64bit scaled offset", + TEST_FUNC("ldff1h z20.d, p4/z, [%[base], z24.d, lsl #1]"), + { /*zt=*/20, /*pg=*/4, /*zm=*/24 }, + std::array { 0x0014, 0x0009 }, + std::array { 14, 9 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 32bit unscaled offset sxtw", - TEST_FUNC("st1h z12.s, p4, [%[base], z19.s, sxtw]"), - { /*zt=*/12, /*pg=*/4, /*zm=*/19 }, - std::array { -14, -112, 16, 114 }, + "ldff1h scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1h z21.d, p5/z, [%[base], z25.d]"), + { /*zt=*/21, /*pg=*/5, /*zm=*/25 }, + std::array { 0x2019, 0x00f1 }, + std::array { 19, 31 }, + element_size_t::BYTE, + }, + { + "ldff1h scalar+vector 64bit unscaled offset Zt==Zm", + 
TEST_FUNC("ldff1h z22.d, p6/z, [%[base], z22.d]"), + { /*zt=*/22, /*pg=*/6, /*zm=*/22 }, + std::array { 0xf5f6, 0x1009 }, + std::array { 26, 9 }, + element_size_t::BYTE, + }, + // LDFF1SH instructions. + { + "ldff1sh scalar+vector 32bit scaled offset uxtw", + TEST_FUNC("ldff1sh z23.s, p7/z, [%[base], z26.s, uxtw #1]"), + { /*zt=*/23, /*pg=*/7, /*zm=*/26 }, + std::array { 0x0005, 0x0009, 0x0010, 0x0010 }, + std::array { 5, 9, 10, 10 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 32bit unscaled offset sxtw (repeated offset)", - TEST_FUNC("st1h z12.s, p4, [%[base], z19.s, sxtw]"), - { /*zt=*/12, /*pg=*/4, /*zm=*/19 }, - std::array { 15, 15, 17, 17 }, + "ldff1sh scalar+vector 32bit scaled offset sxtw", + TEST_FUNC("ldff1sh z24.s, p0/z, [%[base], z27.s, sxtw #1]"), + { /*zt=*/24, /*pg=*/0, /*zm=*/27 }, + std::array { 0x0023, -9, -8, -15 }, + std::array { -9, 25, 24, 31 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 64bit scaled offset", - TEST_FUNC("st1h z13.d, p5, [%[base], z18.d, lsl #1]"), - { /*zt=*/13, /*pg=*/5, /*zm=*/18 }, - std::array { 16, 113 }, + "ldff1sh scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("ldff1sh z25.d, p1/z, [%[base], z28.d, uxtw #1]"), + { /*zt=*/25, /*pg=*/1, /*zm=*/28 }, + std::array { 0x0005, 0x0019 }, + std::array { 5, 19 }, element_size_t::HALF, - /*scaled=*/true, }, { - "st1h scalar+vector 64bit unscaled offset", - TEST_FUNC("st1h z14.d, p6, [%[base], z17.d]"), - { /*zt=*/14, /*pg=*/6, /*zm=*/17 }, - std::array { 17, 114 }, + "ldff1sh scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("ldff1sh z26.d, p2/z, [%[base], z29.d, sxtw #1]"), + { /*zt=*/26, /*pg=*/2, /*zm=*/29 }, + std::array { -11, -14 }, + std::array { -5, 30 }, element_size_t::HALF, - /*scaled=*/false, }, { - "st1h scalar+vector 64bit unscaled offset (repeated offset)", - TEST_FUNC("st1h z14.d, p6, [%[base], z17.d]"), - { /*zt=*/14, /*pg=*/6, /*zm=*/17 }, - std::array { 18, 18 }, + "ldff1sh scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1sh z27.d, p3/z, [%[base], z30.d, uxtw]"), + { /*zt=*/27, /*pg=*/3, /*zm=*/30 }, + std::array { 0x1211, 0x1312 }, + std::array { 11, 12 }, + element_size_t::BYTE, + }, + { + "ldff1sh scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ldff1sh z28.d, p4/z, [%[base], z31.d, sxtw]"), + { /*zt=*/28, /*pg=*/4, /*zm=*/31 }, + std::array { -2313, 0x1413 }, + std::array { -7, 13 }, + element_size_t::BYTE, + }, + { + "ldff1sh scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("ldff1sh z29.s, p5/z, [%[base], z29.s, uxtw]"), + { /*zt=*/29, /*pg=*/5, /*zm=*/29 }, + std::array { 0x1312, 0x2322, -2313, 0x0807 }, + std::array { 12, 22, 25, 7 }, + element_size_t::BYTE, + }, + { + "ldff1sh scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("ldff1sh z30.s, p6/z, [%[base], z1.s, sxtw]"), + { /*zt=*/30, /*pg=*/6, /*zm=*/1 }, + std::array { -2313, 0x0201, 0x0807, 0x0908 }, + std::array { -7, 1, 7, 8 }, + element_size_t::BYTE, + }, + { + "ldff1sh scalar+vector 64bit scaled offset", + TEST_FUNC("ldff1sh z31.d, p7/z, [%[base], z2.d, lsl #1]"), + { /*zt=*/31, /*pg=*/7, /*zm=*/2 }, + std::array { -10, -14 }, + std::array { 26, 30 }, element_size_t::HALF, - /*scaled=*/false, }, - // ST1W instructions. 
{ - "st1w scalar+vector 32bit scaled offset uxtw", - TEST_FUNC("st1w z15.s, p7, [%[base], z16.s, uxtw #2]"), - { /*zt=*/15, /*pg=*/7, /*zm=*/16 }, - std::array { 19, 115, 23, 119 }, - element_size_t::SINGLE, - /*scaled=*/true, + "ldff1sh scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1sh z0.d, p0/z, [%[base], z3.d]"), + { /*zt=*/0, /*pg=*/0, /*zm=*/3 }, + std::array { 0x0706, 0x0504 }, + std::array { 6, 4 }, + element_size_t::BYTE, }, { - "st1w scalar+vector 32bit scaled offset sxtw", - TEST_FUNC("st1w z16.s, p0, [%[base], z15.s, sxtw #2]"), - { /*zt=*/16, /*pg=*/0, /*zm=*/15 }, - std::array { -20, -116, 24, 120 }, - element_size_t::SINGLE, - /*scaled=*/true, + "ldff1sh scalar+vector 64bit unscaled offset Zt==Zm", + TEST_FUNC("ldff1sh z1.d, p1/z, [%[base], z1.d]"), + { /*zt=*/1, /*pg=*/1, /*zm=*/1 }, + std::array { 0x0605, 0x2120 }, + std::array { 5, 20 }, + element_size_t::BYTE, }, + // LDFF1W instructions. { - "st1w scalar+vector 32bit unpacked scaled offset uxtw", - TEST_FUNC("st1w z17.d, p1, [%[base], z14.d, uxtw #2]"), - { /*zt=*/17, /*pg=*/1, /*zm=*/14 }, - std::array { 21, 117 }, + "ldff1w scalar+vector 32bit scaled offset uxtw", + TEST_FUNC("ldff1w z2.s, p2/z, [%[base], z4.s, uxtw #2]"), + { /*zt=*/2, /*pg=*/2, /*zm=*/4 }, + std::array { 0x00000005, 0x00000002, 0x00000020, 0x00000000 }, + std::array { 5, 2, 20, 0 }, element_size_t::SINGLE, - /*scaled=*/true, }, { - "st1w scalar+vector 32bit unpacked scaled offset sxtw", - TEST_FUNC("st1w z18.d, p2, [%[base], z13.d, sxtw #2]"), - { /*zt=*/18, /*pg=*/2, /*zm=*/13 }, - std::array { -22, 118 }, + "ldff1w scalar+vector 32bit scaled offset sxtw", + TEST_FUNC("ldff1w z3.s, p3/z, [%[base], z5.s, sxtw #2]"), + { /*zt=*/3, /*pg=*/3, /*zm=*/5 }, + std::array { 0x00000021, 0x00000007, 0x00000023, 0x00000017 }, + std::array { -11, 7, 23, 17 }, element_size_t::SINGLE, - /*scaled=*/true, }, { - "st1w scalar+vector 32bit unpacked unscaled offset uxtw", - TEST_FUNC("st1w z19.d, p3, [%[base], z12.d, uxtw]"), - { /*zt=*/19, /*pg=*/3, /*zm=*/12 }, - std::array { 23, 119 }, + "ldff1w scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("ldff1w z4.d, p4/z, [%[base], z6.d, uxtw #2]"), + { /*zt=*/4, /*pg=*/4, /*zm=*/6 }, + std::array { 0x00000003, 0x00000023 }, + std::array { 3, 23 }, element_size_t::SINGLE, - /*scaled=*/false, }, { - "st1w scalar+vector 32bit unpacked unscaled offset sxtw", - TEST_FUNC("st1w z20.d, p4, [%[base], z11.d, sxtw]"), - { /*zt=*/20, /*pg=*/4, /*zm=*/11 }, - std::array { -24, 120 }, + "ldff1w scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("ldff1w z5.d, p5/z, [%[base], z7.d, sxtw #2]"), + { /*zt=*/5, /*pg=*/5, /*zm=*/7 }, + std::array { 0x00000021, 0xfffffff7 }, + std::array { -11, 25 }, element_size_t::SINGLE, - /*scaled=*/false, }, { - "st1w scalar+vector 32bit unscaled offset uxtw", - TEST_FUNC("st1w z21.s, p5, [%[base], z10.s, uxtw]"), - { /*zt=*/21, /*pg=*/5, /*zm=*/10 }, - std::array { 25, 121, 29, 125 }, - element_size_t::SINGLE, - /*scaled=*/false, + "ldff1w scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1w z6.d, p6/z, [%[base], z8.d, uxtw]"), + { /*zt=*/6, /*pg=*/6, /*zm=*/8 }, + std::array { 0x13121110, 0xf7f82322 }, + std::array { 10, 22 }, + element_size_t::BYTE, }, { - "st1w scalar+vector 32bit unscaled offset sxtw", - TEST_FUNC("st1w z22.s, p6, [%[base], z9.s, sxtw]"), - { /*zt=*/22, /*pg=*/6, /*zm=*/9 }, - std::array { -26, -122, 30, 126 }, - element_size_t::SINGLE, - /*scaled=*/false, + "ldff1w scalar+vector 32bit unpacked unscaled offset sxtw", + 
TEST_FUNC("ldff1w z7.d, p7/z, [%[base], z9.d, sxtw]"), + { /*zt=*/7, /*pg=*/7, /*zm=*/9 }, + std::array { 0xf6f7f823, 0x13121110 }, + std::array { -9, 10 }, + element_size_t::BYTE, }, { - "st1w scalar+vector 32bit unscaled offset sxtw (repeated offset)", - TEST_FUNC("st1w z22.s, p6, [%[base], z9.s, sxtw]"), - { /*zt=*/22, /*pg=*/6, /*zm=*/9 }, - std::array { -27, -27, 30, 30 }, + "ldff1w scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("ldff1w z8.s, p0/z, [%[base], z10.s, uxtw]"), + { /*zt=*/8, /*pg=*/0, /*zm=*/10 }, + std::array { 0x020100f1, 0x07060504, 0xf2f3f4f5, 0x19181716 }, + std::array { 31, 4, 27, 16 }, + element_size_t::BYTE, + }, + { + "ldff1w scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("ldff1w z9.s, p1/z, [%[base], z11.s, sxtw]"), + { /*zt=*/9, /*pg=*/1, /*zm=*/11 }, + std::array { 0xf4f5f6f7, 0x03020100, 0xf1f2f3f4, 0xf2f3f4f5 }, + std::array { -7, 0, 28, 27 }, + element_size_t::BYTE, + }, + { + "ldff1w scalar+vector 64bit scaled offset", + TEST_FUNC("ldff1w z10.d, p2/z, [%[base], z12.d, lsl #2]"), + { /*zt=*/10, /*pg=*/2, /*zm=*/12 }, + std::array { 0xfffffff5, 0x00000002 }, + std::array { 27, 2 }, element_size_t::SINGLE, - /*scaled=*/false, }, { - "st1w scalar+vector 64bit scaled offset", - TEST_FUNC("st1w z23.d, p7, [%[base], z8.d, lsl #2]"), - { /*zt=*/23, /*pg=*/7, /*zm=*/8 }, - std::array { 28, 123 }, + "ldff1w scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1w z11.d, p3/z, [%[base], z13.d]"), + { /*zt=*/11, /*pg=*/3, /*zm=*/13 }, + std::array { 0x11100908, 0x23222120 }, + std::array { 8, 20 }, + element_size_t::BYTE, + }, + { + "ldff1w scalar+vector 64bit unscaled offset Zt==Zm", + TEST_FUNC("ldff1w z12.d, p4/z, [%[base], z12.d]"), + { /*zt=*/12, /*pg=*/4, /*zm=*/12 }, + std::array { 0x06050403, 0x07060504 }, + std::array { 3, 4 }, + element_size_t::BYTE, + }, + // LDFF1SW instructions. + { + "ldff1sw scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("ldff1sw z13.d, p5/z, [%[base], z14.d, uxtw #2]"), + { /*zt=*/13, /*pg=*/5, /*zm=*/14 }, + std::array { 0x00000017, 0x00000015 }, + std::array { 17, 15 }, element_size_t::SINGLE, - /*scaled=*/true, }, { - "st1w scalar+vector 64bit unscaled offset", - TEST_FUNC("st1w z24.d, p0, [%[base], z7.d]"), - { /*zt=*/24, /*pg=*/0, /*zm=*/7 }, - std::array { 29, 124 }, + "ldff1sw scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("ldff1sw z14.d, p6/z, [%[base], z15.d, sxtw #2]"), + { /*zt=*/14, /*pg=*/6, /*zm=*/15 }, + std::array { 0x00000023, 0x00000013 }, + std::array { -9, 13 }, element_size_t::SINGLE, - /*scaled=*/false, }, { - "st1w scalar+vector 64bit unscaled offset (repeated offset)", - TEST_FUNC("st1w z24.d, p0, [%[base], z7.d]"), - { /*zt=*/24, /*pg=*/0, /*zm=*/7 }, - std::array { 30, 30 }, + "ldff1sw scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1sw z15.d, p7/z, [%[base], z16.d, uxtw]"), + { /*zt=*/15, /*pg=*/7, /*zm=*/16 }, + std::array { -185207049, 0x07060504 }, + std::array { 25, 4 }, + element_size_t::BYTE, + }, + { + "ldff1sw scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ldff1sw z16.d, p0/z, [%[base], z17.d, sxtw]"), + { /*zt=*/16, /*pg=*/0, /*zm=*/17 }, + std::array { -151521245, -218893067 }, + std::array { -9, 27 }, + element_size_t::BYTE, + }, + { + "ldff1sw scalar+vector 64bit scaled offset", + TEST_FUNC("ldff1sw z17.d, p1/z, [%[base], z18.d, lsl #2]"), + { /*zt=*/17, /*pg=*/1, /*zm=*/18 }, + std::array { -9, -10 }, + std::array { 25, 26 }, element_size_t::SINGLE, - /*scaled=*/false, }, - // ST1D instructions. 
{ - "st1d scalar+vector 32bit unpacked scaled offset uxtw", - TEST_FUNC("st1d z25.d, p1, [%[base], z6.d, uxtw #3]"), - { /*zt=*/25, /*pg=*/1, /*zm=*/6 }, - std::array { 31, 125 }, - element_size_t::DOUBLE, - /*scaled=*/true, + "ldff1sw scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1sw z18.d, p2/z, [%[base], z19.d]"), + { /*zt=*/18, /*pg=*/2, /*zm=*/19 }, + std::array { 0x06050403, 0x15141312 }, + std::array { 3, 12 }, + element_size_t::BYTE, }, { - "st1d scalar+vector 32bit unpacked scaled offset sxtw", - TEST_FUNC("st1d z26.d, p2, [%[base], z5.d, sxtw #3]"), - { /*zt=*/26, /*pg=*/2, /*zm=*/5 }, - std::array { -32, 126 }, - element_size_t::DOUBLE, - /*scaled=*/true, + "ldff1sw scalar+vector 64bit unscaled offset Zt==Zm", + TEST_FUNC("ldff1sw z19.d, p3/z, [%[base], z19.d]"), + { /*zt=*/19, /*pg=*/3, /*zm=*/19 }, + std::array { -151521245, 0x13121110 }, + std::array { 23, 10 }, + element_size_t::BYTE, }, + // LDFF1D instructions. { - "st1d scalar+vector 32bit unpacked unscaled offset uxtw", - TEST_FUNC("st1d z27.d, p3, [%[base], z4.d, uxtw]"), - { /*zt=*/27, /*pg=*/3, /*zm=*/4 }, - std::array { 33, 127 }, + "ldff1d scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("ldff1d z20.d, p4/z, [%[base], z20.d, uxtw #3]"), + { /*zt=*/20, /*pg=*/4, /*zm=*/20 }, + std::array { 0xfffffffffffffff4, 0x0000000000000008 }, + std::array { 28, 8 }, element_size_t::DOUBLE, - /*scaled=*/false, }, { - "st1d scalar+vector 32bit unpacked unscaled offset sxtw", - TEST_FUNC("st1d z28.d, p4, [%[base], z3.d, sxtw]"), - { /*zt=*/28, /*pg=*/4, /*zm=*/3 }, - std::array { -34, 128 }, + "ldff1d scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("ldff1d z21.d, p5/z, [%[base], z21.d, sxtw #3]"), + { /*zt=*/21, /*pg=*/5, /*zm=*/21 }, + std::array { 0x0000000000000019, 0x0000000000000011 }, + std::array { -13, 11 }, element_size_t::DOUBLE, - /*scaled=*/false, }, { - "st1d scalar+vector 64bit scaled offset", - TEST_FUNC("st1d z29.d, p5, [%[base], z2.d, lsl #3]"), - { /*zt=*/29, /*pg=*/5, /*zm=*/2 }, - std::array { 36, 129 }, - element_size_t::DOUBLE, - /*scaled=*/true, + "ldff1d scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("ldff1d z22.d, p6/z, [%[base], z22.d, uxtw]"), + { /*zt=*/22, /*pg=*/6, /*zm=*/22 }, + std::array { 0x2019181716151413, 0x2322212019181716 }, + std::array { 13, 16 }, + element_size_t::BYTE, }, { - "st1d scalar+vector 64bit unscaled offset", - TEST_FUNC("st1d z30.d, p6, [%[base], z1.d]"), - { /*zt=*/30, /*pg=*/6, /*zm=*/1 }, - std::array { 37, 130 }, - element_size_t::DOUBLE, - /*scaled=*/false, + "ldff1d scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("ldff1d z23.d, p7/z, [%[base], z23.d, sxtw]"), + { /*zt=*/23, /*pg=*/7, /*zm=*/23 }, + std::array { 0xf7f8232221201918, 0x2322212019181716 }, + std::array { -14, 16 }, + element_size_t::BYTE, }, { - "st1d scalar+vector 64bit unscaled offset (repeated offset)", - TEST_FUNC("st1d z30.d, p6, [%[base], z1.d]"), - { /*zt=*/30, /*pg=*/6, /*zm=*/1 }, - std::array { 38, 38 }, + "ldff1d scalar+vector 64bit scaled offset", + TEST_FUNC("ldff1d z24.d, p0/z, [%[base], z24.d, lsl #3]"), + { /*zt=*/24, /*pg=*/0, /*zm=*/24 }, + std::array { 0x0000000000000003, 0x0000000000000016 }, + std::array { 3, 16 }, element_size_t::DOUBLE, - /*scaled=*/false, + }, + { + "ldff1d scalar+vector 64bit unscaled offset", + TEST_FUNC("ldff1d z25.d, p1/z, [%[base], z25.d]"), + { /*zt=*/25, /*pg=*/1, /*zm=*/25 }, + std::array { 0x1312111009080706, 0x2221201918171615 }, + std::array { 6, 15 }, + element_size_t::BYTE, + }, + { + 
"ldff1d scalar+vector 64bit unscaled offset Zt==Zm", + TEST_FUNC("ldff1d z26.d, p2/z, [%[base], z26.d]"), + { /*zt=*/26, /*pg=*/2, /*zm=*/26 }, + std::array { 0x00f1f2f3f4f5f6f7, 0x1211100908070605 }, + std::array { 25, 5 }, + element_size_t::BYTE, }, }); # undef TEST_FUNC } -struct vector_plus_immediate_load_test_case_t - : public test_case_base_t { - vector_reg_value128_t reference_data_; - vector_reg_value128_t base_data_; +struct scalar_plus_vector_store_test_case_t : public scalar_plus_vector_test_case_base_t { + vector_reg_value128_t offset_data_; struct registers_used_t { - unsigned dest_z; + unsigned src_z; unsigned governing_p; - unsigned base_z; - } registers_used_; + unsigned index_z; + }; + registers_used_t registers_used_; - template - vector_plus_immediate_load_test_case_t( + element_size_t stored_value_size_; + + bool scaled_; + + expected_values_t expected_values_; + + template + scalar_plus_vector_store_test_case_t( std::string name, test_func_t func, registers_used_t registers_used, - std::array reference_data, - std::array base) - : test_case_base_t( + std::array offsets, + element_size_t stored_value_size, bool scaled) + : scalar_plus_vector_test_case_base_t( std::move(name), std::move(func), registers_used.governing_p, - static_cast(sizeof(BASE_T)), SCATTER_GATHER_INSTRUCTION) + static_cast(sizeof(OFFSET_T)), + /*base_ptr=*/OUTPUT_DATA.base_addr()) , registers_used_(registers_used) - + , stored_value_size_(stored_value_size) + , scaled_(scaled) + , expected_values_(offsets, stored_value_size) { - std::memcpy(reference_data_.data(), reference_data.data(), - reference_data_.size()); - std::memcpy(base_data_.data(), base.data(), base_data_.size()); + std::memcpy(offset_data_.data(), offsets.data(), offset_data_.size()); } virtual test_ptrs_t setup(test_register_data_t ®ister_data, bool force_fault, size_t faulting_element) override { - // Set the value for the base vector register. - register_data.before.set_z_register_value(registers_used_.base_z, base_data_); + // Set the value for the offset register. 
+        register_data.before.set_z_register_value(registers_used_.index_z, offset_data_);
 
         if (force_fault) {
-            assert(element_size_ == element_size_t::DOUBLE);
-
-            register_data.before.set_z_register_element(registers_used_.base_z,
-                                                        faulting_element,
-                                                        INPUT_DATA.faulting_base_addr(0));
+            const size_t offset = test_memory_t::CHUNK_SIZE;
+            switch (element_size_) {
+            case element_size_t::SINGLE:
+                register_data.before.set_z_register_element(
+                    registers_used_.index_z, faulting_element, offset);
+                break;
+            case element_size_t::DOUBLE:
+                register_data.before.set_z_register_element(
+                    registers_used_.index_z, faulting_element, offset);
+                break;
+            default:
+                assert(false &&
+                       "scalar+vector instruction should have single or double "
+                       "element size");
+            }
         }
 
+        register_data.before.set_z_register_value(registers_used_.src_z,
+                                                  { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                                    0x06, 0x07, 0x08, 0x09, 0x10, 0x11,
+                                                    0x12, 0x13, 0x14, 0x15 });
+        OUTPUT_DATA.reset();
+
         return {
+            base_ptr_,
             register_data.before.z.data(),
             register_data.before.p.data(),
             register_data.after.z.data(),
@@ -2235,43 +2425,8 @@ struct vector_plus_immediate_load_test_case_t
     check_output(predicate_reg_value128_t pred, const test_register_data_t &register_data,
                  bool expected_fault, size_t faulting_element) override
     {
-        const auto vl_bytes = get_vl_bytes();
-
-        if (!expected_fault) {
-            std::vector<uint8_t> expected_output_data;
-            expected_output_data.resize(vl_bytes);
-
-            assert(reference_data_.size() == TEST_VL_BYTES);
-            for (size_t i = 0; i < vl_bytes / TEST_VL_BYTES; i++) {
-                memcpy(&expected_output_data[TEST_VL_BYTES * i], reference_data_.data(),
-                       TEST_VL_BYTES);
-            }
-            apply_predicate_mask(expected_output_data, pred, element_size_);
-            const scalable_reg_value_t expected_output {
-                expected_output_data.data(),
-                vl_bytes,
-            };
-
-            const auto output_value =
-                register_data.after.get_z_register_value(registers_used_.dest_z);
-
-            if (output_value != expected_output) {
-                test_failed();
-                print("predicate: ");
-                print_predicate(register_data.before.get_p_register_value(
-                    registers_used_.governing_p));
-                print("\nexpected: ");
-                print_vector(expected_output);
-                print("\nactual: ");
-                print_vector(output_value);
-                print("\n");
-            }
-        }
-
         // Check that the values of the other Z registers have been preserved.
         for (size_t i = 0; i < NUM_Z_REGS; i++) {
-            if (i == registers_used_.dest_z && !expected_fault)
-                continue;
             check_z_reg(i, register_data);
         }
         // Check that the values of the P registers have been preserved.
@@ -2279,103 +2434,910 @@ struct vector_plus_immediate_load_test_case_t
             check_p_reg(i, register_data);
         }
         check_ffr(register_data);
+
+        if (!expected_fault) {
+            switch (element_size_) {
+            case element_size_t::SINGLE: {
+                std::array base_ptrs { base_ptr_, base_ptr_, base_ptr_,
+                                       base_ptr_ };
+                switch (stored_value_size_) {
+                case element_size_t::BYTE:
+                    check_expected_values(expected_values_.u8x4, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                case element_size_t::HALF:
+                    check_expected_values(expected_values_.u16x4, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                case element_size_t::SINGLE:
+                    check_expected_values(expected_values_.u32x4, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                }
+                break;
+            }
+            case element_size_t::DOUBLE: {
+                std::array base_ptrs { base_ptr_, base_ptr_ };
+                switch (stored_value_size_) {
+                case element_size_t::BYTE:
+                    check_expected_values(expected_values_.u8x2, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                case element_size_t::HALF:
+                    check_expected_values(expected_values_.u16x2, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                case element_size_t::SINGLE:
+                    check_expected_values(expected_values_.u32x2, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                case element_size_t::DOUBLE:
+                    check_expected_values(expected_values_.u64x2, pred, base_ptrs,
+                                          scaled_);
+                    break;
+                }
+                break;
+            }
+            }
+        }
     }
 };
 
 test_result_t
-test_ld1_vector_plus_immediate()
+test_st1_scalar_plus_vector()
 {
-#    define TEST_FUNC(ld_instruction) \
-    [](vector_plus_immediate_load_test_case_t::test_ptrs_t &ptrs) { \
-        asm(/* clang-format off */ \
-            RESTORE_FFR(p_restore_base) \
-            RESTORE_Z_REGISTERS(z_restore_base) \
-            RESTORE_P_REGISTERS(p_restore_base) \
-            ld_instruction "\n" \
-            SAVE_Z_REGISTERS(z_save_base) \
-            SAVE_P_REGISTERS(p_save_base) \
-            SAVE_FFR(p_save_base) /* clang-format on */ \
-            : \
-            : [z_restore_base] "r"(ptrs.z_restore_base), \
-              [z_save_base] "r"(ptrs.z_save_base), \
-              [p_restore_base] "r"(ptrs.p_restore_base), \
-              [p_save_base] "r"(ptrs.p_save_base) \
-            : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \
+#    define TEST_FUNC(st_instruction) \
+    [](scalar_plus_vector_store_test_case_t::test_ptrs_t &ptrs) { \
+        asm(/* clang-format off */ \
+            RESTORE_FFR(p_restore_base) \
+            RESTORE_Z_REGISTERS(z_restore_base) \
+            RESTORE_P_REGISTERS(p_restore_base) \
+            st_instruction "\n" \
+            SAVE_Z_REGISTERS(z_save_base) \
+            SAVE_P_REGISTERS(p_save_base) \
+            SAVE_FFR(p_save_base) /* clang-format on */ \
+            : \
+            : [base] "r"(ptrs.base), [z_restore_base] "r"(ptrs.z_restore_base), \
+              [z_save_base] "r"(ptrs.z_save_base), \
+              [p_restore_base] "r"(ptrs.p_restore_base), \
+              [p_save_base] "r"(ptrs.p_save_base) \
+            : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \
     }
 
-    const auto get_base_ptr = [&](element_size_t element_size, size_t offset) {
-        void *start = INPUT_DATA.base_addr_for_data_size(element_size);
-        switch (element_size) {
-        case element_size_t::BYTE:
-            return reinterpret_cast<uintptr_t>(&static_cast<int8_t *>(start)[offset]);
-        case element_size_t::HALF:
-            return reinterpret_cast<uintptr_t>(&static_cast<int16_t *>(start)[offset]);
-        case element_size_t::SINGLE:
-            return reinterpret_cast<uintptr_t>(&static_cast<int32_t *>(start)[offset]);
-        case element_size_t::DOUBLE:
-            return reinterpret_cast<uintptr_t>(&static_cast<int64_t *>(start)[offset]);
-        }
-        assert(false); // unreachable
-        return uintptr_t(0);
-    };
-    return run_tests<vector_plus_immediate_load_test_case_t>({
+    return run_tests<scalar_plus_vector_store_test_case_t>({
        /* {
        *     Test name,
        *     Function that executes the test instruction,
-       *     Registers used {zt, pg, zn},
-       *     Expected output data,
-       *     Base data (value for zn),
+       *     Registers used {zt, pg, zm},
+       *     Offset data (value for zm),
+       *     Stored value size,
+       *     Is the index scaled,
        * },
        */
-    /* TODO i#5036: Add tests for 32-bit
element variants. - * For example: ld1b z0.s, p0/z, [z31.s, #0]. - * These instructions require 32-bit base pointers and I'm not sure - * how we can reliably and portably guarantee that allocated memory - * has an address that fits into 32-bits. - */ + // ST1B instructions. { - "ld1b vector+immediate 64bit element", - TEST_FUNC("ld1b z0.d, p0/z, [z31.d, #0]"), - { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, - std::array { 0x00, 0x16 }, - std::array { - get_base_ptr(element_size_t::BYTE, 0), - get_base_ptr(element_size_t::BYTE, 16), - }, + "st1b scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("st1b z0.d, p0, [%[base], z31.d, uxtw]"), + { /*zt=*/0, /*pg=*/0, /*zm=*/31 }, + std::array { 0, 100 }, + element_size_t::BYTE, + /*scaled=*/false, }, { - "ld1b vector+immediate 64bit element (max index)", - TEST_FUNC("ld1b z0.d, p0/z, [z31.d, #31]"), - { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, - std::array { 0xf1, 0xf1 }, - std::array { - get_base_ptr(element_size_t::BYTE, 0), - get_base_ptr(element_size_t::BYTE, 0), - }, + "st1b scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("st1b z1.d, p1, [%[base], z30.d, sxtw]"), + { /*zt=*/1, /*pg=*/1, /*zm=*/30 }, + std::array { -1, 101 }, + element_size_t::BYTE, + /*scaled=*/false, }, { - "ld1sb vector+immediate 64bit element", - TEST_FUNC("ld1sb z3.d, p1/z, [z27.d, #1]"), - { /*zt=*/3, /*pg=*/1, /*zn=*/27 }, - std::array { 0x02, -15 }, - std::array { - get_base_ptr(element_size_t::BYTE, 1), - get_base_ptr(element_size_t::BYTE, 30), - }, + "st1b scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("st1b z2.s, p2, [%[base], z29.s, uxtw]"), + { /*zt=*/2, /*pg=*/2, /*zm=*/29 }, + std::array { 2, 102, 3, 103 }, + element_size_t::BYTE, + /*scaled=*/false, }, { - "ld1sb vector+immediate 64bit element (max index)", - TEST_FUNC("ld1sb z3.d, p1/z, [z27.d, #31]"), - { /*zt=*/3, /*pg=*/1, /*zn=*/27 }, - std::array { -15, -15 }, - std::array { + "st1b scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("st1b z3.s, p3, [%[base], z28.s, sxtw]"), + { /*zt=*/3, /*pg=*/3, /*zm=*/28 }, + std::array { -3, -103, 4, 104 }, + element_size_t::BYTE, + /*scaled=*/false, + }, + { + "st1b scalar+vector 32bit unscaled offset sxtw (repeated offset)", + TEST_FUNC("st1b z3.s, p3, [%[base], z28.s, sxtw]"), + { /*zt=*/3, /*pg=*/3, /*zm=*/28 }, + std::array { -4, -4, 5, 5 }, + element_size_t::BYTE, + /*scaled=*/false, + }, + { + "st1b scalar+vector 64bit unscaled offset", + TEST_FUNC("st1b z4.d, p4, [%[base], z27.d]"), + { /*zt=*/4, /*pg=*/4, /*zm=*/27 }, + std::array { 5, 104 }, + element_size_t::BYTE, + /*scaled=*/false, + }, + { + "st1b scalar+vector 64bit unscaled offset (repeated offset)", + TEST_FUNC("st1b z4.d, p4, [%[base], z27.d]"), + { /*zt=*/4, /*pg=*/4, /*zm=*/27 }, + std::array { 6, 6 }, + element_size_t::BYTE, + /*scaled=*/false, + }, + // ST1H instructions. 
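+        // ST1H covers every addressing form: scaled and unscaled offsets with
+        // uxtw/sxtw extends, the unpacked (.d element, 32-bit offset) variants,
+        // and the 64-bit offset forms.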
+ { + "st1h scalar+vector 32bit scaled offset uxtw", + TEST_FUNC("st1h z5.s, p5, [%[base], z26.s, uxtw #1]"), + { /*zt=*/5, /*pg=*/5, /*zm=*/26 }, + std::array { 7, 105, 9, 107 }, + element_size_t::HALF, + /*scaled=*/true, + }, + { + "st1h scalar+vector 32bit scaled offset sxtw", + TEST_FUNC("st1h z6.s, p6, [%[base], z25.s, sxtw #1]"), + { /*zt=*/6, /*pg=*/6, /*zm=*/25 }, + std::array { -8, -106, 10, 108 }, + element_size_t::HALF, + /*scaled=*/true, + }, + { + "st1h scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("st1h z7.d, p7, [%[base], z24.d, uxtw #1]"), + { /*zt=*/7, /*pg=*/7, /*zm=*/24 }, + std::array { 9, 107 }, + element_size_t::HALF, + /*scaled=*/true, + }, + { + "st1h scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("st1h z8.d, p0, [%[base], z23.d, sxtw #1]"), + { /*zt=*/8, /*pg=*/0, /*zm=*/23 }, + std::array { -10, 108 }, + element_size_t::HALF, + /*scaled=*/true, + }, + { + "st1h scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("st1h z9.d, p1, [%[base], z22.d, uxtw]"), + { /*zt=*/9, /*pg=*/1, /*zm=*/22 }, + std::array { 11, 109 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("st1h z10.d, p2, [%[base], z21.d, sxtw]"), + { /*zt=*/10, /*pg=*/2, /*zm=*/21 }, + std::array { -12, 110 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("st1h z11.s, p3, [%[base], z20.s, uxtw]"), + { /*zt=*/11, /*pg=*/3, /*zm=*/20 }, + std::array { 13, 111, 15, 113 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("st1h z12.s, p4, [%[base], z19.s, sxtw]"), + { /*zt=*/12, /*pg=*/4, /*zm=*/19 }, + std::array { -14, -112, 16, 114 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("st1h z12.s, p4, [%[base], z19.s, sxtw]"), + { /*zt=*/12, /*pg=*/4, /*zm=*/19 }, + std::array { -14, -112, 16, 114 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 32bit unscaled offset sxtw (repeated offset)", + TEST_FUNC("st1h z12.s, p4, [%[base], z19.s, sxtw]"), + { /*zt=*/12, /*pg=*/4, /*zm=*/19 }, + std::array { 15, 15, 17, 17 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 64bit scaled offset", + TEST_FUNC("st1h z13.d, p5, [%[base], z18.d, lsl #1]"), + { /*zt=*/13, /*pg=*/5, /*zm=*/18 }, + std::array { 16, 113 }, + element_size_t::HALF, + /*scaled=*/true, + }, + { + "st1h scalar+vector 64bit unscaled offset", + TEST_FUNC("st1h z14.d, p6, [%[base], z17.d]"), + { /*zt=*/14, /*pg=*/6, /*zm=*/17 }, + std::array { 17, 114 }, + element_size_t::HALF, + /*scaled=*/false, + }, + { + "st1h scalar+vector 64bit unscaled offset (repeated offset)", + TEST_FUNC("st1h z14.d, p6, [%[base], z17.d]"), + { /*zt=*/14, /*pg=*/6, /*zm=*/17 }, + std::array { 18, 18 }, + element_size_t::HALF, + /*scaled=*/false, + }, + // ST1W instructions. 
+ { + "st1w scalar+vector 32bit scaled offset uxtw", + TEST_FUNC("st1w z15.s, p7, [%[base], z16.s, uxtw #2]"), + { /*zt=*/15, /*pg=*/7, /*zm=*/16 }, + std::array { 19, 115, 23, 119 }, + element_size_t::SINGLE, + /*scaled=*/true, + }, + { + "st1w scalar+vector 32bit scaled offset sxtw", + TEST_FUNC("st1w z16.s, p0, [%[base], z15.s, sxtw #2]"), + { /*zt=*/16, /*pg=*/0, /*zm=*/15 }, + std::array { -20, -116, 24, 120 }, + element_size_t::SINGLE, + /*scaled=*/true, + }, + { + "st1w scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("st1w z17.d, p1, [%[base], z14.d, uxtw #2]"), + { /*zt=*/17, /*pg=*/1, /*zm=*/14 }, + std::array { 21, 117 }, + element_size_t::SINGLE, + /*scaled=*/true, + }, + { + "st1w scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("st1w z18.d, p2, [%[base], z13.d, sxtw #2]"), + { /*zt=*/18, /*pg=*/2, /*zm=*/13 }, + std::array { -22, 118 }, + element_size_t::SINGLE, + /*scaled=*/true, + }, + { + "st1w scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("st1w z19.d, p3, [%[base], z12.d, uxtw]"), + { /*zt=*/19, /*pg=*/3, /*zm=*/12 }, + std::array { 23, 119 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + { + "st1w scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("st1w z20.d, p4, [%[base], z11.d, sxtw]"), + { /*zt=*/20, /*pg=*/4, /*zm=*/11 }, + std::array { -24, 120 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + { + "st1w scalar+vector 32bit unscaled offset uxtw", + TEST_FUNC("st1w z21.s, p5, [%[base], z10.s, uxtw]"), + { /*zt=*/21, /*pg=*/5, /*zm=*/10 }, + std::array { 25, 121, 29, 125 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + { + "st1w scalar+vector 32bit unscaled offset sxtw", + TEST_FUNC("st1w z22.s, p6, [%[base], z9.s, sxtw]"), + { /*zt=*/22, /*pg=*/6, /*zm=*/9 }, + std::array { -26, -122, 30, 126 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + { + "st1w scalar+vector 32bit unscaled offset sxtw (repeated offset)", + TEST_FUNC("st1w z22.s, p6, [%[base], z9.s, sxtw]"), + { /*zt=*/22, /*pg=*/6, /*zm=*/9 }, + std::array { -27, -27, 30, 30 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + { + "st1w scalar+vector 64bit scaled offset", + TEST_FUNC("st1w z23.d, p7, [%[base], z8.d, lsl #2]"), + { /*zt=*/23, /*pg=*/7, /*zm=*/8 }, + std::array { 28, 123 }, + element_size_t::SINGLE, + /*scaled=*/true, + }, + { + "st1w scalar+vector 64bit unscaled offset", + TEST_FUNC("st1w z24.d, p0, [%[base], z7.d]"), + { /*zt=*/24, /*pg=*/0, /*zm=*/7 }, + std::array { 29, 124 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + { + "st1w scalar+vector 64bit unscaled offset (repeated offset)", + TEST_FUNC("st1w z24.d, p0, [%[base], z7.d]"), + { /*zt=*/24, /*pg=*/0, /*zm=*/7 }, + std::array { 30, 30 }, + element_size_t::SINGLE, + /*scaled=*/false, + }, + // ST1D instructions. 
+ { + "st1d scalar+vector 32bit unpacked scaled offset uxtw", + TEST_FUNC("st1d z25.d, p1, [%[base], z6.d, uxtw #3]"), + { /*zt=*/25, /*pg=*/1, /*zm=*/6 }, + std::array { 31, 125 }, + element_size_t::DOUBLE, + /*scaled=*/true, + }, + { + "st1d scalar+vector 32bit unpacked scaled offset sxtw", + TEST_FUNC("st1d z26.d, p2, [%[base], z5.d, sxtw #3]"), + { /*zt=*/26, /*pg=*/2, /*zm=*/5 }, + std::array { -32, 126 }, + element_size_t::DOUBLE, + /*scaled=*/true, + }, + { + "st1d scalar+vector 32bit unpacked unscaled offset uxtw", + TEST_FUNC("st1d z27.d, p3, [%[base], z4.d, uxtw]"), + { /*zt=*/27, /*pg=*/3, /*zm=*/4 }, + std::array { 33, 127 }, + element_size_t::DOUBLE, + /*scaled=*/false, + }, + { + "st1d scalar+vector 32bit unpacked unscaled offset sxtw", + TEST_FUNC("st1d z28.d, p4, [%[base], z3.d, sxtw]"), + { /*zt=*/28, /*pg=*/4, /*zm=*/3 }, + std::array { -34, 128 }, + element_size_t::DOUBLE, + /*scaled=*/false, + }, + { + "st1d scalar+vector 64bit scaled offset", + TEST_FUNC("st1d z29.d, p5, [%[base], z2.d, lsl #3]"), + { /*zt=*/29, /*pg=*/5, /*zm=*/2 }, + std::array { 36, 129 }, + element_size_t::DOUBLE, + /*scaled=*/true, + }, + { + "st1d scalar+vector 64bit unscaled offset", + TEST_FUNC("st1d z30.d, p6, [%[base], z1.d]"), + { /*zt=*/30, /*pg=*/6, /*zm=*/1 }, + std::array { 37, 130 }, + element_size_t::DOUBLE, + /*scaled=*/false, + }, + { + "st1d scalar+vector 64bit unscaled offset (repeated offset)", + TEST_FUNC("st1d z30.d, p6, [%[base], z1.d]"), + { /*zt=*/30, /*pg=*/6, /*zm=*/1 }, + std::array { 38, 38 }, + element_size_t::DOUBLE, + /*scaled=*/false, + }, + }); +# undef TEST_FUNC +} + +struct vector_plus_immediate_load_test_case_t + : public test_case_base_t { + vector_reg_value128_t reference_data_; + vector_reg_value128_t base_data_; + + struct registers_used_t { + unsigned dest_z; + unsigned governing_p; + unsigned base_z; + } registers_used_; + + template + vector_plus_immediate_load_test_case_t( + std::string name, test_func_t func, registers_used_t registers_used, + std::array reference_data, + std::array base) + : test_case_base_t( + std::move(name), std::move(func), registers_used.governing_p, + static_cast(sizeof(BASE_T)), SCATTER_GATHER_INSTRUCTION) + , registers_used_(registers_used) + + { + std::memcpy(reference_data_.data(), reference_data.data(), + reference_data_.size()); + std::memcpy(base_data_.data(), base.data(), base_data_.size()); + } + + virtual test_ptrs_t + setup(test_register_data_t ®ister_data, bool force_fault, + size_t faulting_element) override + { + // Set the value for the base vector register. 
+ register_data.before.set_z_register_value(registers_used_.base_z, base_data_); + + if (force_fault) { + assert(element_size_ == element_size_t::DOUBLE); + + register_data.before.set_z_register_element(registers_used_.base_z, + faulting_element, + INPUT_DATA.faulting_base_addr(0)); + } + + return { + register_data.before.z.data(), + register_data.before.p.data(), + register_data.after.z.data(), + register_data.after.p.data(), + }; + } + + void + check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, + bool expected_fault, size_t faulting_element) override + { + const auto vl_bytes = get_vl_bytes(); + + if (!expected_fault) { + std::vector expected_output_data; + expected_output_data.resize(vl_bytes); + + assert(reference_data_.size() == TEST_VL_BYTES); + for (size_t i = 0; i < vl_bytes / TEST_VL_BYTES; i++) { + memcpy(&expected_output_data[TEST_VL_BYTES * i], reference_data_.data(), + TEST_VL_BYTES); + } + apply_predicate_mask(expected_output_data, pred, element_size_); + const scalable_reg_value_t expected_output { + expected_output_data.data(), + vl_bytes, + }; + + const auto output_value = + register_data.after.get_z_register_value(registers_used_.dest_z); + + if (output_value != expected_output) { + test_failed(); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used_.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); + } + } + + // Check that the values of the other Z registers have been preserved. + for (size_t i = 0; i < NUM_Z_REGS; i++) { + if (i == registers_used_.dest_z && !expected_fault) + continue; + check_z_reg(i, register_data); + } + // Check that the values of the P registers have been preserved. + for (size_t i = 0; i < NUM_P_REGS; i++) { + check_p_reg(i, register_data); + } + check_ffr(register_data); + } +}; + +test_result_t +test_ld1_vector_plus_immediate() +{ +# define TEST_FUNC(ld_instruction) \ + [](vector_plus_immediate_load_test_case_t::test_ptrs_t &ptrs) { \ + asm(/* clang-format off */ \ + RESTORE_FFR(p_restore_base) \ + RESTORE_Z_REGISTERS(z_restore_base) \ + RESTORE_P_REGISTERS(p_restore_base) \ + ld_instruction "\n" \ + SAVE_Z_REGISTERS(z_save_base) \ + SAVE_P_REGISTERS(p_save_base) \ + SAVE_FFR(p_save_base) /* clang-format on */ \ + : \ + : [z_restore_base] "r"(ptrs.z_restore_base), \ + [z_save_base] "r"(ptrs.z_save_base), \ + [p_restore_base] "r"(ptrs.p_restore_base), \ + [p_save_base] "r"(ptrs.p_save_base) \ + : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ + } + + const auto get_base_ptr = [&](element_size_t element_size, size_t offset) { + void *start = INPUT_DATA.base_addr_for_data_size(element_size); + switch (element_size) { + case element_size_t::BYTE: + return reinterpret_cast(&static_cast(start)[offset]); + case element_size_t::HALF: + return reinterpret_cast(&static_cast(start)[offset]); + case element_size_t::SINGLE: + return reinterpret_cast(&static_cast(start)[offset]); + case element_size_t::DOUBLE: + return reinterpret_cast(&static_cast(start)[offset]); + } + assert(false); // unreachable + return uintptr_t(0); + }; + return run_tests({ + /* { + * Test name, + * Function that executes the test instruction, + * Registers used {zt, pg, zn}, + * Expected output data, + * Base data (value for zn), + * }, + */ + /* TODO i#5036: Add tests for 32-bit element variants. + * For example: ld1b z0.s, p0/z, [z31.s, #0]. 
+ * These instructions require 32-bit base pointers and I'm not sure + * how we can reliably and portably guarantee that allocated memory + * has an address that fits into 32-bits. + */ + { + "ld1b vector+immediate 64bit element", + TEST_FUNC("ld1b z0.d, p0/z, [z31.d, #0]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0x00, 0x16 }, + std::array { + get_base_ptr(element_size_t::BYTE, 0), + get_base_ptr(element_size_t::BYTE, 16), + }, + }, + { + "ld1b vector+immediate 64bit element (max index)", + TEST_FUNC("ld1b z0.d, p0/z, [z31.d, #31]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0xf1, 0xf1 }, + std::array { + get_base_ptr(element_size_t::BYTE, 0), + get_base_ptr(element_size_t::BYTE, 0), + }, + }, + { + "ld1sb vector+immediate 64bit element", + TEST_FUNC("ld1sb z3.d, p1/z, [z27.d, #1]"), + { /*zt=*/3, /*pg=*/1, /*zn=*/27 }, + std::array { 0x02, -15 }, + std::array { + get_base_ptr(element_size_t::BYTE, 1), + get_base_ptr(element_size_t::BYTE, 30), + }, + }, + { + "ld1sb vector+immediate 64bit element (max index)", + TEST_FUNC("ld1sb z3.d, p1/z, [z27.d, #31]"), + { /*zt=*/3, /*pg=*/1, /*zn=*/27 }, + std::array { -15, -15 }, + std::array { + get_base_ptr(element_size_t::BYTE, 0), + get_base_ptr(element_size_t::BYTE, 0), + }, + }, + { + "ld1h vector+immediate 64bit element", + TEST_FUNC("ld1h z7.d, p2/z, [z23.d, #4]"), + { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, + std::array { 0x04, 0x20 }, + std::array { + get_base_ptr(element_size_t::HALF, 2), + get_base_ptr(element_size_t::HALF, 18), + }, + }, + { + "ld1h vector+immediate 64bit element (max index)", + TEST_FUNC("ld1h z7.d, p2/z, [z23.d, #62]"), + { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, + std::array { 0xfff1, 0xfff1 }, + std::array { + get_base_ptr(element_size_t::HALF, 0), + get_base_ptr(element_size_t::HALF, 0), + }, + }, + { + "ld1sh vector+immediate 64bit element", + TEST_FUNC("ld1sh z11.d, p3/z, [z19.d, #6]"), + { /*zt=*/11, /*pg=*/3, /*zn=*/19 }, + std::array { 0x06, -15 }, + std::array { + get_base_ptr(element_size_t::HALF, 3), + get_base_ptr(element_size_t::HALF, 28), + }, + }, + { + "ld1sh vector+immediate 64bit element (max index)", + TEST_FUNC("ld1sh z11.d, p3/z, [z19.d, #62]"), + { /*zt=*/11, /*pg=*/3, /*zn=*/19 }, + std::array { -15, -14 }, + std::array { + get_base_ptr(element_size_t::HALF, 0), + get_base_ptr(element_size_t::HALF, -1), + }, + }, + { + "ld1w vector+immediate 64bit element", + TEST_FUNC("ld1w z15.d, p4/z, [z15.d, #16]"), + { /*zt=*/15, /*pg=*/4, /*zn=*/15 }, + std::array { 0x08, 0xfffffff8 }, + std::array { + get_base_ptr(element_size_t::SINGLE, 4), + get_base_ptr(element_size_t::SINGLE, 20), + }, + }, + { + "ld1w vector+immediate 64bit element (max index)", + TEST_FUNC("ld1w z15.d, p4/z, [z15.d, #124]"), + { /*zt=*/15, /*pg=*/4, /*zn=*/15 }, + std::array { 0xfffffff1, 0xfffffff3 }, + std::array { + get_base_ptr(element_size_t::SINGLE, 0), + get_base_ptr(element_size_t::SINGLE, -2), + }, + }, + { + "ld1sw vector+immediate 64bit element", + TEST_FUNC("ld1sw z19.d, p5/z, [z11.d, #20]"), + { /*zt=*/19, /*pg=*/5, /*zn=*/11 }, + std::array { 0x10, -14 }, + std::array { + get_base_ptr(element_size_t::SINGLE, 5), + get_base_ptr(element_size_t::SINGLE, 25), + }, + }, + { + "ld1sw vector+immediate 64bit element (max index)", + TEST_FUNC("ld1sw z19.d, p5/z, [z11.d, #124]"), + { /*zt=*/19, /*pg=*/5, /*zn=*/11 }, + std::array { -9, -10 }, + std::array { + get_base_ptr(element_size_t::SINGLE, 26), + get_base_ptr(element_size_t::SINGLE, -5), + }, + }, + { + "ld1d vector+immediate 64bit element", + TEST_FUNC("ld1d 
z23.d, p6/z, [z7.d, #48]"), + { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, + std::array { 0x12, 0xfffffffffffffff4 }, + std::array { + get_base_ptr(element_size_t::DOUBLE, 6), + get_base_ptr(element_size_t::DOUBLE, 22), + }, + }, + { + "ld1d vector+immediate 64bit element (max index)", + TEST_FUNC("ld1d z23.d, p6/z, [z7.d, #248]"), + { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, + std::array { 0xfffffffffffffff1, 0xfffffffffffffff7 }, + std::array { + get_base_ptr(element_size_t::DOUBLE, 0), + get_base_ptr(element_size_t::DOUBLE, -6), + }, + }, + { + "ld1d vector+immediate 64bit element Zt==Zn", + TEST_FUNC("ld1d z27.d, p7/z, [z3.d, #0]"), + { /*zt=*/27, /*pg=*/7, /*zn=*/3 }, + std::array { 0x07, 0x23 }, + std::array { + get_base_ptr(element_size_t::DOUBLE, 7), + get_base_ptr(element_size_t::DOUBLE, 23), + }, + }, + }); +# undef TEST_FUNC +} + +struct vector_plus_immediate_first_fault_load_test_case_t + : public vector_plus_immediate_load_test_case_t { + + template + vector_plus_immediate_first_fault_load_test_case_t( + std::string name, test_func_t func, registers_used_t registers_used, + std::array reference_data, + std::array base) + : vector_plus_immediate_load_test_case_t(std::move(name), std::move(func), + registers_used, reference_data, base) + { + } + + void + check_fault(predicate_reg_value128_t pred, bool expected_fault, + size_t faulting_element, bool signal_handler_called) override + { + expected_fault = + expected_fault && first_active_element_faults(pred, faulting_element); + vector_plus_immediate_load_test_case_t::check_fault( + pred, expected_fault, faulting_element, signal_handler_called); + } + + void + check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, + bool expected_fault, size_t faulting_element) override + { + if (!expected_fault || first_active_element_faults(pred, faulting_element)) { + // If there is no faulting element, or the first active element faults, then + // this instruction behaves the same as a regular vector+immediate load. + vector_plus_immediate_load_test_case_t::check_output( + pred, register_data, expected_fault, faulting_element); + return; + } + + const auto vl_bytes = get_vl_bytes(); + + // Check the FFR value + const auto element_size_bytes = static_cast(element_size_); + const auto num_mask_elements = TEST_VL_BYTES / element_size_bytes; + + const auto original_ffr = register_data.before.get_ffr_value(); + predicate_reg_value128_t ffr_128 = 0; + memcpy(&ffr_128, original_ffr.data, sizeof(ffr_128)); + // All bits from the faulting element onwards are 0 so mask them out. 
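+        // Each element contributes element_size_bytes bits to the predicate,
+        // so (1 << (index * element_size_bytes)) - 1 keeps just the bits for
+        // the elements before the faulting one.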
+ ffr_128 &= + (1 << ((faulting_element % num_mask_elements) * element_size_bytes)) - 1; + + std::vector expected_ffr_data(original_ffr.size, 0); + memcpy(expected_ffr_data.data(), original_ffr.data, + 2 * ((faulting_element * element_size_bytes) / 16)); + memcpy(&expected_ffr_data[2 * ((faulting_element * element_size_bytes) / 16)], + &ffr_128, sizeof(ffr_128)); + const scalable_reg_value_t expected_ffr { + expected_ffr_data.data(), + expected_ffr_data.size(), + }; + + const auto actual_ffr = register_data.after.get_ffr_value(); + + if (actual_ffr != expected_ffr) { + test_failed(); + print("predicate: "); + print_predicate( + register_data.before.get_p_register_value(registers_used_.governing_p)); + print("\noriginal ffr: "); + print_predicate(register_data.before.get_ffr_value()); + print("\nexpected ffr: "); + print_predicate(expected_ffr); + print("\nactual ffr: "); + print_predicate(actual_ffr); + print("\n"); + } + + const auto dest_z = registers_used_.dest_z; + + // Check destination register value. + if (faulting_element > 0) { + std::vector expected_output_data; + expected_output_data.resize(vl_bytes); + + assert(reference_data_.size() == TEST_VL_BYTES); + for (size_t i = 0; i < vl_bytes / TEST_VL_BYTES; i++) { + memcpy(&expected_output_data[TEST_VL_BYTES * i], reference_data_.data(), + TEST_VL_BYTES); + } + apply_predicate_mask(expected_output_data, pred, element_size_); + const scalable_reg_value_t expected_output { + expected_output_data.data(), + vl_bytes, + }; + + const auto output_value = register_data.after.get_z_register_value(dest_z); + + if (memcmp(expected_output.data, output_value.data, faulting_element) != 0) { + test_failed(); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used_.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); + } + } + + // Check that the values of the other Z registers have been preserved. + for (size_t i = 0; i < NUM_Z_REGS; i++) { + if (i != dest_z) + check_z_reg(i, register_data); + } + // Check that the values of the P registers have been preserved. 
+ for (size_t i = 0; i < NUM_P_REGS; i++) { + check_p_reg(i, register_data); + } + } +}; + +test_result_t +test_ldff1_vector_plus_immediate() +{ +# define TEST_FUNC(ld_instruction) \ + [](vector_plus_immediate_first_fault_load_test_case_t::test_ptrs_t &ptrs) { \ + asm(/* clang-format off */ \ + RESTORE_FFR(p_restore_base) \ + RESTORE_Z_REGISTERS(z_restore_base) \ + RESTORE_P_REGISTERS(p_restore_base) \ + ld_instruction "\n" \ + SAVE_Z_REGISTERS(z_save_base) \ + SAVE_P_REGISTERS(p_save_base) \ + SAVE_FFR(p_save_base) /* clang-format on */ \ + : \ + : [z_restore_base] "r"(ptrs.z_restore_base), \ + [z_save_base] "r"(ptrs.z_save_base), \ + [p_restore_base] "r"(ptrs.p_restore_base), \ + [p_save_base] "r"(ptrs.p_save_base) \ + : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ + } + + const auto get_base_ptr = [&](element_size_t element_size, size_t offset) { + void *start = INPUT_DATA.base_addr_for_data_size(element_size); + switch (element_size) { + case element_size_t::BYTE: + return reinterpret_cast(&static_cast(start)[offset]); + case element_size_t::HALF: + return reinterpret_cast(&static_cast(start)[offset]); + case element_size_t::SINGLE: + return reinterpret_cast(&static_cast(start)[offset]); + case element_size_t::DOUBLE: + return reinterpret_cast(&static_cast(start)[offset]); + } + assert(false); // unreachable + return uintptr_t(0); + }; + return run_tests({ + + { + "ldff1b vector+immediate 64bit element", + TEST_FUNC("ldff1b z0.d, p0/z, [z31.d, #0]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0x00, 0x16 }, + std::array { + get_base_ptr(element_size_t::BYTE, 0), + get_base_ptr(element_size_t::BYTE, 16), + }, + }, + { + "ldff1b vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1b z0.d, p0/z, [z31.d, #31]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0xf1, 0xf1 }, + std::array { get_base_ptr(element_size_t::BYTE, 0), get_base_ptr(element_size_t::BYTE, 0), }, }, { - "ld1h vector+immediate 64bit element", - TEST_FUNC("ld1h z7.d, p2/z, [z23.d, #4]"), + "ldff1sb vector+immediate 64bit element", + TEST_FUNC("ldff1sb z3.d, p1/z, [z27.d, #1]"), + { /*zt=*/3, /*pg=*/1, /*zn=*/27 }, + std::array { 0x02, -15 }, + std::array { + get_base_ptr(element_size_t::BYTE, 1), + get_base_ptr(element_size_t::BYTE, 30), + }, + }, + { + "ldff1sb vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1sb z3.d, p1/z, [z27.d, #31]"), + { /*zt=*/3, /*pg=*/1, /*zn=*/27 }, + std::array { -15, -15 }, + std::array { + get_base_ptr(element_size_t::BYTE, 0), + get_base_ptr(element_size_t::BYTE, 0), + }, + }, + { + "ldff1h vector+immediate 64bit element", + TEST_FUNC("ldff1h z7.d, p2/z, [z23.d, #4]"), { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, std::array { 0x04, 0x20 }, std::array { @@ -2384,8 +3346,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1h vector+immediate 64bit element (max index)", - TEST_FUNC("ld1h z7.d, p2/z, [z23.d, #62]"), + "ldff1h vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1h z7.d, p2/z, [z23.d, #62]"), { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, std::array { 0xfff1, 0xfff1 }, std::array { @@ -2394,8 +3356,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1sh vector+immediate 64bit element", - TEST_FUNC("ld1sh z11.d, p3/z, [z19.d, #6]"), + "ldff1sh vector+immediate 64bit element", + TEST_FUNC("ldff1sh z11.d, p3/z, [z19.d, #6]"), { /*zt=*/11, /*pg=*/3, /*zn=*/19 }, std::array { 0x06, -15 }, std::array { @@ -2404,8 +3366,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1sh vector+immediate 64bit element (max index)", - TEST_FUNC("ld1sh z11.d, p3/z, [z19.d, #62]"), + 
"ldff1sh vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1sh z11.d, p3/z, [z19.d, #62]"), { /*zt=*/11, /*pg=*/3, /*zn=*/19 }, std::array { -15, -14 }, std::array { @@ -2414,8 +3376,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1w vector+immediate 64bit element", - TEST_FUNC("ld1w z15.d, p4/z, [z15.d, #16]"), + "ldff1w vector+immediate 64bit element", + TEST_FUNC("ldff1w z15.d, p4/z, [z15.d, #16]"), { /*zt=*/15, /*pg=*/4, /*zn=*/15 }, std::array { 0x08, 0xfffffff8 }, std::array { @@ -2424,8 +3386,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1w vector+immediate 64bit element (max index)", - TEST_FUNC("ld1w z15.d, p4/z, [z15.d, #124]"), + "ldff1w vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1w z15.d, p4/z, [z15.d, #124]"), { /*zt=*/15, /*pg=*/4, /*zn=*/15 }, std::array { 0xfffffff1, 0xfffffff3 }, std::array { @@ -2434,8 +3396,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1sw vector+immediate 64bit element", - TEST_FUNC("ld1sw z19.d, p5/z, [z11.d, #20]"), + "ldff1sw vector+immediate 64bit element", + TEST_FUNC("ldff1sw z19.d, p5/z, [z11.d, #20]"), { /*zt=*/19, /*pg=*/5, /*zn=*/11 }, std::array { 0x10, -14 }, std::array { @@ -2444,8 +3406,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1sw vector+immediate 64bit element (max index)", - TEST_FUNC("ld1sw z19.d, p5/z, [z11.d, #124]"), + "ldff1sw vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1sw z19.d, p5/z, [z11.d, #124]"), { /*zt=*/19, /*pg=*/5, /*zn=*/11 }, std::array { -9, -10 }, std::array { @@ -2453,9 +3415,9 @@ test_ld1_vector_plus_immediate() get_base_ptr(element_size_t::SINGLE, -5), }, }, - { - "ld1d vector+immediate 64bit element", - TEST_FUNC("ld1d z23.d, p6/z, [z7.d, #48]"), + { + "ldff1d vector+immediate 64bit element", + TEST_FUNC("ldff1d z23.d, p6/z, [z7.d, #48]"), { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, std::array { 0x12, 0xfffffffffffffff4 }, std::array { @@ -2464,8 +3426,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1d vector+immediate 64bit element (max index)", - TEST_FUNC("ld1d z23.d, p6/z, [z7.d, #248]"), + "ldff1d vector+immediate 64bit element (max index)", + TEST_FUNC("ldff1d z23.d, p6/z, [z7.d, #248]"), { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, std::array { 0xfffffffffffffff1, 0xfffffffffffffff7 }, std::array { @@ -2474,8 +3436,8 @@ test_ld1_vector_plus_immediate() }, }, { - "ld1d vector+immediate 64bit element Zt==Zn", - TEST_FUNC("ld1d z27.d, p7/z, [z3.d, #0]"), + "ldff1d vector+immediate 64bit element Zt==Zn", + TEST_FUNC("ldff1d z27.d, p7/z, [z3.d, #0]"), { /*zt=*/27, /*pg=*/7, /*zn=*/3 }, std::array { 0x07, 0x23 }, std::array { @@ -2495,54 +3457,309 @@ struct vector_plus_immediate_store_test_case_t struct registers_used_t { unsigned src_z; unsigned governing_p; - unsigned base_z; + unsigned base_z; + } registers_used_; + + element_size_t stored_value_size_; + + expected_values_t expected_values_; + + vector_plus_immediate_store_test_case_t(std::string name, test_func_t func, + registers_used_t registers_used, + std::array base_offsets, + element_size_t stored_value_size, + std::ptrdiff_t immediate_offset) + : test_case_base_t( + std::move(name), std::move(func), registers_used.governing_p, + element_size_t::DOUBLE, SCATTER_GATHER_INSTRUCTION) + , registers_used_(registers_used) + , stored_value_size_(stored_value_size) + , expected_values_( + std::array { immediate_offset, immediate_offset }, + stored_value_size) + { + base_ptrs_[0] = + static_cast(OUTPUT_DATA.base_addr()) + base_offsets[0]; + base_ptrs_[1] = + static_cast(OUTPUT_DATA.base_addr()) + 
base_offsets[1]; + std::memcpy(base_data_.data(), base_ptrs_.data(), base_data_.size()); + } + + virtual test_ptrs_t + setup(test_register_data_t ®ister_data, bool force_fault, + size_t faulting_element) override + { + // Set the value for the base register. + register_data.before.set_z_register_value(registers_used_.base_z, base_data_); + + if (force_fault) { + register_data.before.set_z_register_element(registers_used_.base_z, + faulting_element, + OUTPUT_DATA.faulting_base_addr()); + } + + register_data.before.set_z_register_value(registers_used_.src_z, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15 }); + OUTPUT_DATA.reset(); + + return { + register_data.before.z.data(), + register_data.before.p.data(), + register_data.after.z.data(), + register_data.after.p.data(), + }; + } + + void + check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, + bool expected_fault, size_t faulting_element) override + { + // Check that the values of the Z registers have been preserved. + for (size_t i = 0; i < NUM_Z_REGS; i++) { + check_z_reg(i, register_data); + } + // Check that the values of the P registers have been preserved. + for (size_t i = 0; i < NUM_P_REGS; i++) { + check_p_reg(i, register_data); + } + check_ffr(register_data); + + if (!expected_fault) { + const bool scaled = false; + assert(element_size_ == element_size_t::DOUBLE); + + switch (stored_value_size_) { + case element_size_t::BYTE: + check_expected_values(expected_values_.u8x2, pred, base_ptrs_, scaled); + break; + case element_size_t::HALF: + check_expected_values(expected_values_.u16x2, pred, base_ptrs_, scaled); + break; + case element_size_t::SINGLE: + check_expected_values(expected_values_.u32x2, pred, base_ptrs_, scaled); + break; + case element_size_t::DOUBLE: + check_expected_values(expected_values_.u64x2, pred, base_ptrs_, scaled); + break; + } + } + } +}; + +test_result_t +test_st1_vector_plus_immediate() +{ +# define TEST_FUNC(st_instruction) \ + [](vector_plus_immediate_load_test_case_t::test_ptrs_t &ptrs) { \ + asm(/* clang-format off */ \ + RESTORE_FFR(p_restore_base) \ + RESTORE_Z_REGISTERS(z_restore_base) \ + RESTORE_P_REGISTERS(p_restore_base) \ + st_instruction "\n" \ + SAVE_Z_REGISTERS(z_save_base) \ + SAVE_P_REGISTERS(p_save_base) \ + SAVE_FFR(p_save_base) /* clang-format on */ \ + : \ + : [z_restore_base] "r"(ptrs.z_restore_base), \ + [z_save_base] "r"(ptrs.z_save_base), \ + [p_restore_base] "r"(ptrs.p_restore_base), \ + [p_save_base] "r"(ptrs.p_save_base) \ + : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ + } + + return run_tests({ + /* { + * Test name, + * Function that executes the test instruction, + * Registers used {zt, pg, zn}, + * Offsets + * Stored value size + * #imm index value + * }, + */ + /* TODO i#5036: Add tests for 32-bit element variants. + * For example: st1b z0.s, p0/z, [z31.s, #0]. + * These instructions require 32-bit base pointers and I'm not sure + * how we can reliably and portably guarantee that allocated memory + * has an address that fits into 32-bits. 
+ */ + { + "st1b vector+immediate 64bit element", + TEST_FUNC("st1b z0.d, p0, [z31.d, #0]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0, 16 }, + element_size_t::BYTE, + 0, + }, + { + "st1b vector+immediate 64bit element (max index)", + TEST_FUNC("st1b z0.d, p0, [z31.d, #31]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0, 16 }, + element_size_t::BYTE, + 31, + }, + { + "st1b vector+immediate 64bit element (repeated base)", + TEST_FUNC("st1b z0.d, p0, [z31.d, #0]"), + { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, + std::array { 0, 0 }, + element_size_t::BYTE, + 0, + }, + { + "st1h vector+immediate 64bit element", + TEST_FUNC("st1h z7.d, p2, [z23.d, #4]"), + { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, + std::array { 2, 18 }, + element_size_t::HALF, + 4, + }, + { + "st1h vector+immediate 64bit element (max index)", + TEST_FUNC("st1h z7.d, p2, [z23.d, #62]"), + { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, + std::array { 2, 18 }, + element_size_t::HALF, + 62, + }, + { + "st1h vector+immediate 64bit element (repeated base)", + TEST_FUNC("st1h z7.d, p2, [z23.d, #4]"), + { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, + std::array { 19, 19 }, + element_size_t::HALF, + 4, + }, + { + "st1w vector+immediate 64bit element", + TEST_FUNC("st1w z15.d, p4, [z16.d, #16]"), + { /*zt=*/15, /*pg=*/4, /*zn=*/16 }, + std::array { 4, 20 }, + element_size_t::SINGLE, + 16, + }, + { + "st1w vector+immediate 64bit element (max index)", + TEST_FUNC("st1w z15.d, p4, [z16.d, #124]"), + { /*zt=*/15, /*pg=*/4, /*zn=*/16 }, + std::array { 4, 20 }, + element_size_t::SINGLE, + 124, + }, + { + "st1w vector+immediate 64bit element (repeated base)", + TEST_FUNC("st1w z15.d, p4, [z16.d, #16]"), + { /*zt=*/15, /*pg=*/4, /*zn=*/16 }, + std::array { 21, 21 }, + element_size_t::SINGLE, + 16, + }, + { + "st1d vector+immediate 64bit element", + TEST_FUNC("st1d z23.d, p6, [z7.d, #48]"), + { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, + std::array { 6, 22 }, + element_size_t::DOUBLE, + 48, + }, + { + "st1d vector+immediate 64bit element (max index)", + TEST_FUNC("st1d z23.d, p6, [z7.d, #248]"), + { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, + std::array { 6, 22 }, + element_size_t::DOUBLE, + 248, + }, + { + "st1d vector+immediate 64bit element (repeated base)", + TEST_FUNC("st1d z23.d, p6, [z7.d, #48]"), + { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, + std::array { 23, 23 }, + element_size_t::DOUBLE, + 48, + }, + }); +# undef TEST_FUNC +} + +struct scalar_plus_scalar_test_ptrs_t : public basic_test_ptrs_t { + void *base; // Value used for the scalar base pointer. + int64_t index; // Value used for the scalar index value. 
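+    // These two values are bound to the %[base] and %[index] operands of the
+    // inline asm block that runs the scalar+scalar instruction.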
+ + scalar_plus_scalar_test_ptrs_t(void *base_, int64_t index_, + const void *z_restore_base_, + const void *p_restore_base_, void *z_save_base_, + void *p_save_base_) + : basic_test_ptrs_t { z_restore_base_, p_restore_base_, z_save_base_, + p_save_base_ } + , base(base_) + , index(index_) + { + } +}; + +template +struct scalar_plus_scalar_load_test_case_t + : public test_case_base_t { + std::array, NUM_ZT> reference_data_; + + struct registers_used_t { + std::array dest_z; + unsigned governing_p; } registers_used_; - element_size_t stored_value_size_; + void *base_; + int64_t index_; - expected_values_t expected_values_; + element_size_t data_size_; - vector_plus_immediate_store_test_case_t(std::string name, test_func_t func, - registers_used_t registers_used, - std::array base_offsets, - element_size_t stored_value_size, - std::ptrdiff_t immediate_offset) + size_t loaded_vector_size_; + + template + scalar_plus_scalar_load_test_case_t( + std::string name, test_func_t func, registers_used_t registers_used, + std::array, + NUM_ZT> + reference_data, + element_size_t data_size, int64_t index, + size_t loaded_vector_size = get_vl_bytes()) : test_case_base_t( std::move(name), std::move(func), registers_used.governing_p, - element_size_t::DOUBLE, SCATTER_GATHER_INSTRUCTION) + static_cast(sizeof(ELEMENT_T)), CONTIGUOUS_INSTRUCTION) , registers_used_(registers_used) - , stored_value_size_(stored_value_size) - , expected_values_( - std::array { immediate_offset, immediate_offset }, - stored_value_size) + , base_(INPUT_DATA.base_addr_for_data_size(data_size)) + , index_(index) + , data_size_(data_size) + , loaded_vector_size_(loaded_vector_size) { - base_ptrs_[0] = - static_cast(OUTPUT_DATA.base_addr()) + base_offsets[0]; - base_ptrs_[1] = - static_cast(OUTPUT_DATA.base_addr()) + base_offsets[1]; - std::memcpy(base_data_.data(), base_ptrs_.data(), base_data_.size()); + const auto vl_bytes = get_vl_bytes(); + static constexpr size_t REG_DATA_SIZE = + MAX_SUPPORTED_VL_BYTES / sizeof(ELEMENT_T); + for (size_t i = 0; i < NUM_ZT; i++) { + reference_data_[i].resize(vl_bytes); + memcpy(reference_data_[i].data(), reference_data[i].data(), vl_bytes); + } } virtual test_ptrs_t setup(test_register_data_t ®ister_data, bool force_fault, size_t faulting_element) override { - // Set the value for the base register. - register_data.before.set_z_register_value(registers_used_.base_z, base_data_); + // No Z/P registers to set up. + void *base = base_; if (force_fault) { - register_data.before.set_z_register_element(registers_used_.base_z, - faulting_element, - OUTPUT_DATA.faulting_base_addr()); + const auto element_bytes = static_cast(data_size_); + base = INPUT_DATA.faulting_base_addr((index_ + (NUM_ZT * faulting_element)) * + element_bytes); } - register_data.before.set_z_register_value(registers_used_.src_z, - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, - 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, - 0x12, 0x13, 0x14, 0x15 }); - OUTPUT_DATA.reset(); - return { + base, + index_, register_data.before.z.data(), register_data.before.p.data(), register_data.after.z.data(), @@ -2554,315 +3771,523 @@ struct vector_plus_immediate_store_test_case_t check_output(predicate_reg_value128_t pred, const test_register_data_t ®ister_data, bool expected_fault, size_t faulting_element) override { - // Check that the values of the Z registers have been preserved. 
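+        // Multi-destination loads (NUM_ZT > 1) write several Zt registers, so
+        // each destination is compared against its own reference vector.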
+ if (!expected_fault) { + for (size_t i = 0; i < NUM_ZT; i++) { + std::vector expected_output_data(reference_data_[i]); + apply_predicate_mask(expected_output_data, pred, element_size_); + const scalable_reg_value_t expected_output { + expected_output_data.data(), + expected_output_data.size(), + }; + + const auto output_value = + register_data.after.get_z_register_value(registers_used_.dest_z[i]); + + if (output_value != expected_output) { + test_failed(); + if (NUM_ZT > 1) + print("Zt%u:\n", (unsigned)i); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used_.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); + } + } + } + + // Check that the values of the other Z registers have been preserved. for (size_t i = 0; i < NUM_Z_REGS; i++) { - check_z_reg(i, register_data); + if (expected_fault || + std::find(registers_used_.dest_z.begin(), registers_used_.dest_z.end(), + i) == registers_used_.dest_z.end()) + check_z_reg(i, register_data); } // Check that the values of the P registers have been preserved. for (size_t i = 0; i < NUM_P_REGS; i++) { check_p_reg(i, register_data); } check_ffr(register_data); + } - if (!expected_fault) { - const bool scaled = false; - assert(element_size_ == element_size_t::DOUBLE); - - switch (stored_value_size_) { - case element_size_t::BYTE: - check_expected_values(expected_values_.u8x2, pred, base_ptrs_, scaled); - break; - case element_size_t::HALF: - check_expected_values(expected_values_.u16x2, pred, base_ptrs_, scaled); - break; - case element_size_t::SINGLE: - check_expected_values(expected_values_.u32x2, pred, base_ptrs_, scaled); - break; - case element_size_t::DOUBLE: - check_expected_values(expected_values_.u64x2, pred, base_ptrs_, scaled); - break; - } - } + size_t + num_values_accessed() const override + { + return loaded_vector_size_ / static_cast(element_size_); } }; test_result_t -test_st1_vector_plus_immediate() +test_ld1_scalar_plus_scalar() { -# define TEST_FUNC(st_instruction) \ - [](vector_plus_immediate_load_test_case_t::test_ptrs_t &ptrs) { \ +# define TEST_FUNC(ld_instruction) \ + [](scalar_plus_scalar_load_test_case_t<1>::test_ptrs_t &ptrs) { \ asm(/* clang-format off */ \ RESTORE_FFR(p_restore_base) \ RESTORE_Z_REGISTERS(z_restore_base) \ RESTORE_P_REGISTERS(p_restore_base) \ - st_instruction "\n" \ + ld_instruction "\n" \ SAVE_Z_REGISTERS(z_save_base) \ SAVE_P_REGISTERS(p_save_base) \ SAVE_FFR(p_save_base) /* clang-format on */ \ : \ - : [z_restore_base] "r"(ptrs.z_restore_base), \ + : [base] "r"(ptrs.base), [index] "r"(ptrs.index), \ + [z_restore_base] "r"(ptrs.z_restore_base), \ [z_save_base] "r"(ptrs.z_save_base), \ [p_restore_base] "r"(ptrs.p_restore_base), \ [p_save_base] "r"(ptrs.p_save_base) \ : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ } - return run_tests({ + return run_tests>({ /* { * Test name, * Function that executes the test instruction, - * Registers used {zt, pg, zn}, - * Offsets - * Stored value size - * #imm index value + * Registers used {zt, pg, zm}, + * Expected output data, + * Base pointer (value for Xn), + * Index (value for Xm), * }, */ - /* TODO i#5036: Add tests for 32-bit element variants. - * For example: st1b z0.s, p0/z, [z31.s, #0]. - * These instructions require 32-bit base pointers and I'm not sure - * how we can reliably and portably guarantee that allocated memory - * has an address that fits into 32-bits. - */ + // LD1B instructions. 
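+        // The expected values below mirror the INPUT_DATA layout, which (as the
+        // reference arrays suggest) repeats the byte sequence 0x00-0x23 followed
+        // by 0xf8-0xf1; each test reads that pattern starting at the given index.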
{ - "st1b vector+immediate 64bit element", - TEST_FUNC("st1b z0.d, p0, [z31.d, #0]"), - { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, - std::array { 0, 16 }, + "ld1b scalar+scalar 8bit element", + TEST_FUNC("ld1b z4.b, p7/z, [%[base], %[index]]"), + { /*zt=*/ { 4 }, /*pg=*/7 }, + std::array, 1> { { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x00, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, + 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, + } }, element_size_t::BYTE, - 0, + /*index=*/0, + }, + { + "ld1b scalar+scalar 16bit element", + TEST_FUNC("ld1b z8.h, p6/z, [%[base], %[index]]"), + { /*zt=*/ { 8 }, /*pg=*/6 }, + std::array, 1> { + { 0x00f1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, + 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, + 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, + 0x0023, 0x00f8, 0x00f7, 0x00f6, 0x00f5, 0x00f4, 0x00f3, 0x00f2 } }, + element_size_t::BYTE, + /*index=*/-1, + }, + { + "ld1b scalar+scalar 32bit element", + TEST_FUNC("ld1b z12.s, p5/z, [%[base], %[index]]"), + { /*zt=*/ { 12 }, /*pg=*/5 }, + std::array, 1> { + { 0x000005, 0x000006, 0x000007, 0x000008, 0x000009, 0x000010, 0x000011, + 0x000012, 0x000013, 0x000014, 0x000015, 0x000016, 0x000017, 0x000018, + 0x000019, 0x000020 } }, + element_size_t::BYTE, + /*index=*/5, + }, + { + "ld1b scalar+scalar 64bit element", + TEST_FUNC("ld1b z16.d, p4/z, [%[base], %[index]]"), + { /*zt=*/ { 16 }, /*pg=*/4 }, + std::array, 1> { + { 0x00000000000009, 0x00000000000010, 0x00000000000011, 0x00000000000012, + 0x00000000000013, 0x00000000000014, 0x00000000000015, + 0x00000000000016 } }, + element_size_t::BYTE, + /*index=*/9, + }, + { + "ldnt1b scalar+scalar", + TEST_FUNC("ldnt1b z20.b, p3/z, [%[base], %[index]]"), + { /*zt=*/ { 20 }, /*pg=*/3 }, + std::array, 1> { + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, + 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, + 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x00, + 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, + 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 } }, + element_size_t::BYTE, + /*index=*/0, + }, + // LD1SB + { + "ld1sb scalar+scalar 16bit element", + TEST_FUNC("ld1sb z24.h, p2/z, [%[base], %[index]]"), + { /*zt=*/ { 24 }, /*pg=*/2 }, + std::array, 1> { + { 0xfff3, 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, + 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, + 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, + 0x0021, 0x0022, 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4 } }, + element_size_t::BYTE, + /*index=*/-3, + }, + { + "ld1sb scalar+scalar 32bit element", + TEST_FUNC("ld1sb z28.s, p1/z, [%[base], %[index]]"), + { /*zt=*/ { 28 }, /*pg=*/1 }, + std::array, 1> { + { 0x000005, 0x000006, 0x000007, 0x000008, 0x000009, 0x000010, 0x000011, + 0x000012, 0x000013, 0x000014, 0x000015, 0x000016, 0x000017, 0x000018, + 0x000019, 0x000020 } }, + element_size_t::BYTE, + /*index=*/5, + }, + { + "ld1sb scalar+scalar 64bit element", + TEST_FUNC("ld1sb z31.d, p0/z, [%[base], %[index]]"), + { /*zt=*/ { 31 }, /*pg=*/0 }, + std::array, 1> { { -12, -13, -14, -15, 0, 1, 2, 3 } }, + element_size_t::BYTE, + 
/*index=*/28, + }, + // LD1H + { + "ld1h scalar+scalar 16bit element", + TEST_FUNC("ld1h z27.h, p1/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 27 }, /*pg=*/1 }, + std::array, 1> { + { 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, + 0x0022, 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, + 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005 } }, + element_size_t::HALF, + /*index=*/6, }, { - "st1b vector+immediate 64bit element (max index)", - TEST_FUNC("st1b z0.d, p0, [z31.d, #31]"), - { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, - std::array { 0, 16 }, - element_size_t::BYTE, - 31, + "ld1h scalar+scalar 32bit element", + TEST_FUNC("ld1h z23.s, p2/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 23 }, /*pg=*/2 }, + std::array, 1> { + { 0x00000009, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014, + 0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x00000020, + 0x00000021, 0x00000022, 0x00000023, 0x0000fff8 } }, + element_size_t::HALF, + /*index=*/9, }, { - "st1b vector+immediate 64bit element (repeated base)", - TEST_FUNC("st1b z0.d, p0, [z31.d, #0]"), - { /*zt=*/0, /*pg=*/0, /*zn=*/31 }, - std::array { 0, 0 }, - element_size_t::BYTE, - 0, + "ld1h scalar+scalar 64bit element", + TEST_FUNC("ld1h z19.d, p3/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 19 }, /*pg=*/3 }, + std::array, 1> { + { 0x000000000000fff2, 0x000000000000fff1, 0x0000000000000000, + 0x0000000000000001, 0x0000000000000002, 0x0000000000000003, + 0x0000000000000004, 0x0000000000000005 } }, + element_size_t::HALF, + /*index=*/-2, }, { - "st1h vector+immediate 64bit element", - TEST_FUNC("st1h z7.d, p2, [z23.d, #4]"), - { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, - std::array { 2, 18 }, + "ldnt1h scalar+scalar", + TEST_FUNC("ldnt1h z15.h, p4/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 15 }, /*pg=*/4 }, + std::array, 1> { + { 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, + 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, + 0x0022, 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, + 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005 } }, element_size_t::HALF, - 4, + /*index=*/6, }, + // LD1SH { - "st1h vector+immediate 64bit element (max index)", - TEST_FUNC("st1h z7.d, p2, [z23.d, #62]"), - { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, - std::array { 2, 18 }, + "ld1sh scalar+scalar 32bit element", + TEST_FUNC("ld1sh z11.s, p5/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 11 }, /*pg=*/5 }, + std::array, 1> { + { 0x00000009, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014, + 0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x00000020, + 0x00000021, 0x00000022, 0x00000023, 0xfffffff8 } }, element_size_t::HALF, - 62, + /*index=*/9, }, { - "st1h vector+immediate 64bit element (repeated base)", - TEST_FUNC("st1h z7.d, p2, [z23.d, #4]"), - { /*zt=*/7, /*pg=*/2, /*zn=*/23 }, - std::array { 19, 19 }, + "ld1sh scalar+scalar 64bit element", + TEST_FUNC("ld1sh z7.d, p6/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 7 }, /*pg=*/6 }, + std::array, 1> { + { 0xfffffffffffffff2, 0xfffffffffffffff1, 0x0000000000000000, + 0x0000000000000001, 0x0000000000000002, 0x0000000000000003, + 0x0000000000000004, 0x0000000000000005 } }, element_size_t::HALF, - 4, + /*index=*/-2, }, + // LD1W { - "st1w vector+immediate 64bit element", - TEST_FUNC("st1w z15.d, p4, [z16.d, #16]"), - { /*zt=*/15, /*pg=*/4, /*zn=*/16 }, - std::array { 4, 20 }, + "ld1w scalar+scalar 32bit element", + 
TEST_FUNC("ld1w z3.s, p7/z, [%[base], %[index], lsl #2]"), + { /*zt=*/ { 3 }, /*pg=*/7 }, + std::array, 1> { + { 0x00000017, 0x00000018, 0x00000019, 0x00000020, 0x00000021, 0x00000022, + 0x00000023, 0xfffffff8, 0xfffffff7, 0xfffffff6, 0xfffffff5, 0xfffffff4, + 0xfffffff3, 0xfffffff2, 0xfffffff1, 0x00000000 } }, element_size_t::SINGLE, - 16, + /*index=*/17, }, { - "st1w vector+immediate 64bit element (max index)", - TEST_FUNC("st1w z15.d, p4, [z16.d, #124]"), - { /*zt=*/15, /*pg=*/4, /*zn=*/16 }, - std::array { 4, 20 }, + "ld1w scalar+scalar 64bit element", + TEST_FUNC("ld1w z1.d, p6/z, [%[base], %[index], lsl #2]"), + { /*zt=*/ { 1 }, /*pg=*/6 }, + std::array, 1> { + { 0x00000000fffffff1, 0x0000000000000000, 0x0000000000000001, + 0x0000000000000002, 0x0000000000000003, 0x0000000000000004, + 0x0000000000000005, 0x0000000000000006 } }, element_size_t::SINGLE, - 124, + /*index=*/-1, }, { - "st1w vector+immediate 64bit element (repeated base)", - TEST_FUNC("st1w z15.d, p4, [z16.d, #16]"), - { /*zt=*/15, /*pg=*/4, /*zn=*/16 }, - std::array { 21, 21 }, + "ldnt1w scalar+scalar", + TEST_FUNC("ldnt1w z5.s, p5/z, [%[base], %[index], lsl #2]"), + { /*zt=*/ { 5 }, /*pg=*/5 }, + std::array, 1> { + { 0x00000018, 0x00000019, 0x00000020, 0x00000021, 0x00000022, 0x00000023, + 0xfffffff8, 0xfffffff7, 0xfffffff6, 0xfffffff5, 0xfffffff4, 0xfffffff3, + 0xfffffff2, 0xfffffff1, 0x00000000, 0x00000001 } }, element_size_t::SINGLE, - 16, + /*index=*/18, }, + // LD1SW { - "st1d vector+immediate 64bit element", - TEST_FUNC("st1d z23.d, p6, [z7.d, #48]"), - { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, - std::array { 6, 22 }, + "ld1sw scalar+scalar", + TEST_FUNC("ld1sw z9.d, p4/z, [%[base], %[index], lsl #2]"), + { /*zt=*/ { 9 }, /*pg=*/4 }, + std::array, 1> { + { 0xfffffffffffffff1, 0x0000000000000000, 0x0000000000000001, + 0x0000000000000002, 0x0000000000000003, 0x0000000000000004, + 0x0000000000000005, 0x0000000000000006 } }, + element_size_t::SINGLE, + /*index=*/-1, + }, + // LD1D + { + "ld1d scalar+scalar", + TEST_FUNC("ld1d z13.d, p3/z, [%[base], %[index], lsl #3]"), + { /*zt=*/ { 13 }, /*pg=*/3 }, + std::array, 1> { + { 0x0000000000000008, 0x0000000000000009, 0x0000000000000010, + 0x0000000000000011, 0x0000000000000012, 0x0000000000000013, + 0x0000000000000014, 0x0000000000000015 } }, element_size_t::DOUBLE, - 48, + /*index=*/8, }, { - "st1d vector+immediate 64bit element (max index)", - TEST_FUNC("st1d z23.d, p6, [z7.d, #248]"), - { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, - std::array { 6, 22 }, + "ldnt1d scalar+scalar", + TEST_FUNC("ldnt1d z17.d, p2/z, [%[base], %[index], lsl #3]"), + { /*zt=*/ { 17 }, /*pg=*/2 }, + std::array, 1> { + { 0x0000000000000002, 0x0000000000000003, 0x0000000000000004, + 0x0000000000000005, 0x0000000000000006, 0x0000000000000007, + 0x0000000000000008, 0x0000000000000009 } }, element_size_t::DOUBLE, - 248, + /*index=*/2, }, + // Load and replicate instructions { - "st1d vector+immediate 64bit element (repeated base)", - TEST_FUNC("st1d z23.d, p6, [z7.d, #48]"), - { /*zt=*/23, /*pg=*/6, /*zn=*/7 }, - std::array { 23, 23 }, + "ld1rqb scalar+scalar", + TEST_FUNC("ld1rqb z21.b, p1/z, [%[base], %[index]]"), + { /*zt=*/ { 21 }, /*pg=*/1 }, + std::array, 1> { + { 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, + 0x17, 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x06, + 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, + 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, + 0x13, 0x14, 
0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21 } }, + element_size_t::BYTE, + /*index=*/6, + /*loaded_vector_size=*/16, + }, + { + "ld1rqh scalar+scalar", + TEST_FUNC("ld1rqh z25.h, p0/z, [%[base], %[index], lsl #1]"), + { /*zt=*/ { 25 }, /*pg=*/0 }, + std::array, 1> { + { 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019 } }, + element_size_t::HALF, + /*index=*/12, + /*loaded_vector_size=*/16, + }, + { + "ld1rqw scalar+scalar", + TEST_FUNC("ld1rqw z29.s, p1/z, [%[base], %[index], lsl #2]"), + { /*zt=*/ { 29 }, /*pg=*/1 }, + std::array, 1> { + { 0x00000020, 0x00000021, 0x00000022, 0x00000023, 0x00000020, 0x00000021, + 0x00000022, 0x00000023, 0x00000020, 0x00000021, 0x00000022, 0x00000023, + 0x00000020, 0x00000021, 0x00000022, 0x00000023 } }, + element_size_t::SINGLE, + /*index=*/-12, + /*loaded_vector_size=*/16, + }, + { + "ld1rqd scalar+scalar", + TEST_FUNC("ld1rqd z31.d, p2/z, [%[base], %[index], lsl #3]"), + { /*zt=*/ { 31 }, /*pg=*/2 }, + std::array, 1> { + { 0xfffffffffffffff6, 0xfffffffffffffff5, 0xfffffffffffffff6, + 0xfffffffffffffff5, 0xfffffffffffffff6, 0xfffffffffffffff5, + 0xfffffffffffffff6, 0xfffffffffffffff5 } }, element_size_t::DOUBLE, - 48, + /*index=*/-6, + /*loaded_vector_size=*/16, }, }); # undef TEST_FUNC } -struct scalar_plus_scalar_test_ptrs_t : public basic_test_ptrs_t { - void *base; // Value used for the scalar base pointer. - int64_t index; // Value used for the scalar index value. - - scalar_plus_scalar_test_ptrs_t(void *base_, int64_t index_, - const void *z_restore_base_, - const void *p_restore_base_, void *z_save_base_, - void *p_save_base_) - : basic_test_ptrs_t { z_restore_base_, p_restore_base_, z_save_base_, - p_save_base_ } - , base(base_) - , index(index_) - { - } -}; - -template -struct scalar_plus_scalar_load_test_case_t - : public test_case_base_t { - std::array, NUM_ZT> reference_data_; - - struct registers_used_t { - std::array dest_z; - unsigned governing_p; - } registers_used_; - - void *base_; - int64_t index_; +struct scalar_plus_scalar_first_fault_load_test_case_t + : public scalar_plus_scalar_load_test_case_t<1> { - element_size_t data_size_; - - size_t loaded_vector_size_; + std::vector reference_data_fault_; - template - scalar_plus_scalar_load_test_case_t( - std::string name, test_func_t func, registers_used_t registers_used, - std::array, - NUM_ZT> - reference_data, - element_size_t data_size, int64_t index, - size_t loaded_vector_size = get_vl_bytes()) - : test_case_base_t( - std::move(name), std::move(func), registers_used.governing_p, - static_cast(sizeof(ELEMENT_T)), CONTIGUOUS_INSTRUCTION) - , registers_used_(registers_used) - , base_(INPUT_DATA.base_addr_for_data_size(data_size)) - , index_(index) - , data_size_(data_size) - , loaded_vector_size_(loaded_vector_size) + template + scalar_plus_scalar_first_fault_load_test_case_t( + std::string name, test_func_t func, registers_used_t registers_used, + std::array reference_data, + std::array reference_data_fault_128, + std::array reference_data_fault_256, + std::array reference_data_fault_512, + element_size_t data_size, int64_t index, + size_t loaded_vector_size = get_vl_bytes()) + : scalar_plus_scalar_load_test_case_t<1>( + std::move(name), std::move(func), registers_used, + std::array { reference_data }, data_size, + index) { const auto vl_bytes = get_vl_bytes(); - static constexpr 
size_t REG_DATA_SIZE =
-            MAX_SUPPORTED_VL_BYTES / sizeof(ELEMENT_T);
-        for (size_t i = 0; i < NUM_ZT; i++) {
-            reference_data_[i].resize(vl_bytes);
-            memcpy(reference_data_[i].data(), reference_data[i].data(), vl_bytes);
+        reference_data_fault_.resize(vl_bytes);
+        switch (vl_bytes) {
+        case 16:
+            assert(reference_data_fault_128.size() * sizeof(ELEMENT_T) == vl_bytes);
+            memcpy(reference_data_fault_.data(), reference_data_fault_128.data(),
+                   vl_bytes);
+            break;
+        case 32:
+            assert(reference_data_fault_256.size() * sizeof(ELEMENT_T) == vl_bytes);
+            memcpy(reference_data_fault_.data(), reference_data_fault_256.data(),
+                   vl_bytes);
+            break;
+        case 64:
+            assert(reference_data_fault_512.size() * sizeof(ELEMENT_T) == vl_bytes);
+            memcpy(reference_data_fault_.data(), reference_data_fault_512.data(),
+                   vl_bytes);
+            break;
+        default: print("Unsupported vector length: %lu\n", vl_bytes); exit(1);
        }
    }

-    virtual test_ptrs_t
-    setup(test_register_data_t &register_data, bool force_fault,
-          size_t faulting_element) override
+    void
+    check_fault(predicate_reg_value128_t pred, bool expected_fault,
+                size_t faulting_element, bool signal_handler_called) override
    {
-        // No Z/P registers to set up.
-
-        void *base = base_;
-        if (force_fault) {
-            const auto element_bytes = static_cast(data_size_);
-            base = INPUT_DATA.faulting_base_addr((index_ + (NUM_ZT * faulting_element)) *
-                                                 element_bytes);
-        }
-
-        return {
-            base,
-            index_,
-            register_data.before.z.data(),
-            register_data.before.p.data(),
-            register_data.after.z.data(),
-            register_data.after.p.data(),
-        };
+        expected_fault =
+            expected_fault && first_active_element_faults(pred, faulting_element);
+        scalar_plus_scalar_load_test_case_t<1>::check_fault(
+            pred, expected_fault, faulting_element, signal_handler_called);
    }

    void
    check_output(predicate_reg_value128_t pred, const test_register_data_t &register_data,
                 bool expected_fault, size_t faulting_element) override
    {
-        if (!expected_fault) {
-            for (size_t i = 0; i < NUM_ZT; i++) {
-                std::vector expected_output_data(reference_data_[i]);
-                apply_predicate_mask(expected_output_data, pred, element_size_);
-                const scalable_reg_value_t expected_output {
-                    expected_output_data.data(),
-                    expected_output_data.size(),
-                };
+        if (!expected_fault || first_active_element_faults(pred, faulting_element)) {
+            // If there is no faulting element, or the first active element faults, then
+            // this instruction behaves the same as a regular scalar+scalar load.
+            scalar_plus_scalar_load_test_case_t<1>::check_output(
+                pred, register_data, expected_fault, faulting_element);
+            return;
+        }

-            const auto output_value =
-                register_data.after.get_z_register_value(registers_used_.dest_z[i]);
+        // Check the FFR value
+        const auto element_size_bytes = static_cast(element_size_);
+        const auto num_mask_elements = TEST_VL_BYTES / element_size_bytes;

-            if (output_value != expected_output) {
-                test_failed();
-                if (NUM_ZT > 1)
-                    print("Zt%u:\n", (unsigned)i);
-                print("predicate: ");
-                print_predicate(register_data.before.get_p_register_value(
-                    registers_used_.governing_p));
-                print("\nexpected: ");
-                print_vector(expected_output);
-                print("\nactual: ");
-                print_vector(output_value);
-                print("\n");
-            }
+        const auto original_ffr = register_data.before.get_ffr_value();
+        predicate_reg_value128_t ffr_128 = 0;
+        memcpy(&ffr_128, original_ffr.data, sizeof(ffr_128));
+        // All bits from the faulting element onwards are 0 so mask them out.
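+        // (Like other SVE predicate registers, the FFR allocates one bit per
+        // vector byte, so element i maps to bit i * element_size_bytes; the
+        // mask below keeps only the bits for elements before the fault.)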
+ ffr_128 &= + (1 << ((faulting_element % num_mask_elements) * element_size_bytes)) - 1; + + std::vector expected_ffr_data(original_ffr.size, 0); + memcpy(expected_ffr_data.data(), original_ffr.data, + 2 * ((faulting_element * element_size_bytes) / 16)); + memcpy(&expected_ffr_data[2 * ((faulting_element * element_size_bytes) / 16)], + &ffr_128, sizeof(ffr_128)); + const scalable_reg_value_t expected_ffr { + expected_ffr_data.data(), + expected_ffr_data.size(), + }; + + const auto actual_ffr = register_data.after.get_ffr_value(); + + if (actual_ffr != expected_ffr) { + test_failed(); + print("predicate: "); + print_predicate( + register_data.before.get_p_register_value(registers_used_.governing_p)); + print("\noriginal ffr: "); + print_predicate(register_data.before.get_ffr_value()); + print("\nexpected ffr: "); + print_predicate(expected_ffr); + print("\nactual ffr: "); + print_predicate(actual_ffr); + print("\n"); + } + + assert(registers_used_.dest_z.size() == 1); + const auto dest_z = registers_used_.dest_z[0]; + + // Check destination register value. + if (faulting_element > 0) { + std::vector expected_output_data(reference_data_fault_); + apply_predicate_mask(expected_output_data, pred, element_size_); + const scalable_reg_value_t expected_output { + expected_output_data.data(), + expected_output_data.size(), + }; + + const auto output_value = register_data.after.get_z_register_value(dest_z); + + if (memcmp(expected_output.data, output_value.data, faulting_element) != 0) { + test_failed(); + print("predicate: "); + print_predicate(register_data.before.get_p_register_value( + registers_used_.governing_p)); + print("\nexpected: "); + print_vector(expected_output); + print("\nactual: "); + print_vector(output_value); + print("\n"); } } // Check that the values of the other Z registers have been preserved. for (size_t i = 0; i < NUM_Z_REGS; i++) { - if (expected_fault || - std::find(registers_used_.dest_z.begin(), registers_used_.dest_z.end(), - i) == registers_used_.dest_z.end()) + if (i != dest_z) check_z_reg(i, register_data); } // Check that the values of the P registers have been preserved. 
for (size_t i = 0; i < NUM_P_REGS; i++) { check_p_reg(i, register_data); } - check_ffr(register_data); - } - - size_t - num_values_accessed() const override - { - return loaded_vector_size_ / static_cast(element_size_); } }; test_result_t -test_ld1_scalar_plus_scalar() +test_ldff1_scalar_plus_scalar() { -# define TEST_FUNC(ld_instruction) \ - [](scalar_plus_scalar_load_test_case_t<1>::test_ptrs_t &ptrs) { \ +# define TEST_FUNC(ld_instruction) \ + [](scalar_plus_scalar_first_fault_load_test_case_t::test_ptrs_t &ptrs) { \ asm(/* clang-format off */ \ RESTORE_FFR(p_restore_base) \ RESTORE_Z_REGISTERS(z_restore_base) \ @@ -2870,313 +4295,316 @@ test_ld1_scalar_plus_scalar() ld_instruction "\n" \ SAVE_Z_REGISTERS(z_save_base) \ SAVE_P_REGISTERS(p_save_base) \ - SAVE_FFR(p_save_base) /* clang-format on */ \ - : \ - : [base] "r"(ptrs.base), [index] "r"(ptrs.index), \ - [z_restore_base] "r"(ptrs.z_restore_base), \ - [z_save_base] "r"(ptrs.z_save_base), \ - [p_restore_base] "r"(ptrs.p_restore_base), \ - [p_save_base] "r"(ptrs.p_save_base) \ - : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ + SAVE_FFR(p_save_base) /* clang-format on */ \ + : \ + : [base] "r"(ptrs.base), [index] "r"(ptrs.index), \ + [z_restore_base] "r"(ptrs.z_restore_base), \ + [z_save_base] "r"(ptrs.z_save_base), \ + [p_restore_base] "r"(ptrs.p_restore_base), \ + [p_save_base] "r"(ptrs.p_save_base) \ + : ALL_Z_REGS, ALL_P_REGS _FFR, "memory"); \ } - return run_tests>({ + return run_tests({ /* { * Test name, * Function that executes the test instruction, * Registers used {zt, pg, zm}, - * Expected output data, + * Expected output data if no fault, + * Expected output data if fault, * Base pointer (value for Xn), * Index (value for Xm), * }, */ - // LD1B instructions. + // LDFF1B instructions { - "ld1b scalar+scalar 8bit element", - TEST_FUNC("ld1b z4.b, p7/z, [%[base], %[index]]"), - { /*zt=*/ { 4 }, /*pg=*/7 }, - std::array, 1> { { + "ldff1b scalar+scalar 8bit element", + TEST_FUNC("ldff1b z0.b, p7/z, [%[base], %[index]]"), + { /*zt=*/0, /*pg=*/7 }, + std::array { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, - 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, - } }, + 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 }, + std::array { 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + std::array { 0xf2, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + std::array { + 0xf4, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xf3, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xf2, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xf1, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, element_size_t::BYTE, /*index=*/0, }, { - "ld1b scalar+scalar 16bit element", - TEST_FUNC("ld1b z8.h, p6/z, [%[base], %[index]]"), - { /*zt=*/ { 8 }, /*pg=*/6 }, - std::array, 1> { - { 0x00f1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, - 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 
0x0014, - 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, - 0x0023, 0x00f8, 0x00f7, 0x00f6, 0x00f5, 0x00f4, 0x00f3, 0x00f2 } }, - element_size_t::BYTE, - /*index=*/-1, - }, - { - "ld1b scalar+scalar 32bit element", - TEST_FUNC("ld1b z12.s, p5/z, [%[base], %[index]]"), - { /*zt=*/ { 12 }, /*pg=*/5 }, - std::array, 1> { - { 0x000005, 0x000006, 0x000007, 0x000008, 0x000009, 0x000010, 0x000011, - 0x000012, 0x000013, 0x000014, 0x000015, 0x000016, 0x000017, 0x000018, - 0x000019, 0x000020 } }, - element_size_t::BYTE, - /*index=*/5, - }, - { - "ld1b scalar+scalar 64bit element", - TEST_FUNC("ld1b z16.d, p4/z, [%[base], %[index]]"), - { /*zt=*/ { 16 }, /*pg=*/4 }, - std::array, 1> { - { 0x00000000000009, 0x00000000000010, 0x00000000000011, 0x00000000000012, - 0x00000000000013, 0x00000000000014, 0x00000000000015, - 0x00000000000016 } }, - element_size_t::BYTE, - /*index=*/9, - }, - { - "ldnt1b scalar+scalar", - TEST_FUNC("ldnt1b z20.b, p3/z, [%[base], %[index]]"), - { /*zt=*/ { 20 }, /*pg=*/3 }, - std::array, 1> { - { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, - 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, - 0x22, 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1, 0x00, - 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x22, - 0x23, 0xf8, 0xf7, 0xf6, 0xf5, 0xf4, 0xf3, 0xf2, 0xf1 } }, + "ldff1b scalar+scalar 16bit element", + TEST_FUNC("ldff1b z1.h, p6/z, [%[base], %[index]]"), + { /*zt=*/1, /*pg=*/6 }, + std::array { + 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, + 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, + 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, 0x0023, 0x00f8, + 0x00f7, 0x00f6, 0x00f5, 0x00f4, 0x00f3, 0x00f2, 0x00f1, 0x0000 }, + std::array { 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x0000, 0x0000, + 0x0000, 0x0000 }, + std::array { 0x00f1, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00ff, 0x00ff, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000 }, + std::array { + 0x00f2, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x00f1, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, 0x00ff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, element_size_t::BYTE, - /*index=*/0, + /*index=*/1, }, - // LD1SB { - "ld1sb scalar+scalar 16bit element", - TEST_FUNC("ld1sb z24.h, p2/z, [%[base], %[index]]"), - { /*zt=*/ { 24 }, /*pg=*/2 }, - std::array, 1> { - { 0xfff3, 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, - 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, - 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, - 0x0021, 0x0022, 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4 } }, + "ldff1b scalar+scalar 32bit element", + TEST_FUNC("ldff1b z2.s, p5/z, [%[base], %[index]]"), + { /*zt=*/2, /*pg=*/5 }, + std::array { 0x000000f2, 0x000000f1, 0x00000000, 0x00000001, + 0x00000002, 0x00000003, 0x00000004, 0x00000005, + 0x00000006, 0x00000007, 0x00000008, 0x00000009, + 0x00000010, 0x00000011, 0x00000012, 0x00000013 }, + std::array { 0x000000ff, 0x000000ff, 0x00000000, 0x00000000 }, + std::array { 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + std::array { 0x000000f1, 0x000000ff, 0x000000ff, 0x000000ff, + 0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 
0x00000000 }, element_size_t::BYTE, - /*index=*/-3, + /*index=*/-2, }, { - "ld1sb scalar+scalar 32bit element", - TEST_FUNC("ld1sb z28.s, p1/z, [%[base], %[index]]"), - { /*zt=*/ { 28 }, /*pg=*/1 }, - std::array, 1> { - { 0x000005, 0x000006, 0x000007, 0x000008, 0x000009, 0x000010, 0x000011, - 0x000012, 0x000013, 0x000014, 0x000015, 0x000016, 0x000017, 0x000018, - 0x000019, 0x000020 } }, + "ldff1b scalar+scalar 64bit element", + TEST_FUNC("ldff1b z3.d, p4/z, [%[base], %[index]]"), + { /*zt=*/3, /*pg=*/4 }, + std::array { 0x0000000000000003, 0x0000000000000004, + 0x0000000000000005, 0x0000000000000006, + 0x0000000000000007, 0x0000000000000008, + 0x0000000000000009, 0x0000000000000010 }, + std::array { 0x00000000000000ff, 0x0000000000000000 }, + std::array { 0x00000000000000ff, 0x00000000000000ff, + 0x0000000000000000, 0x0000000000000000 }, + std::array { 0x00000000000000ff, 0x00000000000000ff, + 0x00000000000000ff, 0x00000000000000ff, + 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000 }, element_size_t::BYTE, - /*index=*/5, + /*index=*/3, }, + // LDFF1SB instructions { - "ld1sb scalar+scalar 64bit element", - TEST_FUNC("ld1sb z31.d, p0/z, [%[base], %[index]]"), - { /*zt=*/ { 31 }, /*pg=*/0 }, - std::array, 1> { { -12, -13, -14, -15, 0, 1, 2, 3 } }, + "ldff1sb scalar+scalar 16bit element", + TEST_FUNC("ldff1sb z4.h, p3/z, [%[base], %[index]]"), + { /*zt=*/4, /*pg=*/3 }, + std::array { + -12, -13, -14, -15, 0x0000, 0x0001, 0x0002, 0x0003, + 0x0004, 0x0005, 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, + 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, + 0x0020, 0x0021, 0x0022, 0x0023, -8, -9, -10, -11 }, + std::array { -1, -1, -1, -1, 0, 0, 0, 0 }, + std::array { -15, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 0, 0 }, + std::array { -14, -1, -1, -1, -1, -1, -1, -1, -15, -1, -1, + -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, element_size_t::BYTE, - /*index=*/28, + /*index=*/-4, }, - // LD1H { - "ld1h scalar+scalar 16bit element", - TEST_FUNC("ld1h z27.h, p1/z, [%[base], %[index], lsl #1]"), - { /*zt=*/ { 27 }, /*pg=*/1 }, - std::array, 1> { - { 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, - 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, - 0x0022, 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, - 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005 } }, - element_size_t::HALF, - /*index=*/6, + "ldff1sb scalar+scalar 32bit element", + TEST_FUNC("ldff1sb z5.s, p2/z, [%[base], %[index]]"), + { /*zt=*/5, /*pg=*/2 }, + std::array { 0x00000005, 0x00000006, 0x00000007, 0x00000008, + 0x00000009, 0x00000010, 0x00000011, 0x00000012, + 0x00000013, 0x00000014, 0x00000015, 0x00000016, + 0x00000017, 0x00000018, 0x00000019, 0x00000020 }, + std::array { -1, -1, 0, 0 }, + std::array { -1, -1, -1, -1, 0, 0, 0, 0 }, + std::array { -15, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, + 0, 0 }, + element_size_t::BYTE, + /*index=*/5, }, { - "ld1h scalar+scalar 32bit element", - TEST_FUNC("ld1h z23.s, p2/z, [%[base], %[index], lsl #1]"), - { /*zt=*/ { 23 }, /*pg=*/2 }, - std::array, 1> { - { 0x00000009, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014, - 0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x00000020, - 0x00000021, 0x00000022, 0x00000023, 0x0000fff8 } }, - element_size_t::HALF, - /*index=*/9, + "ldff1sb scalar+scalar 64bit element", + TEST_FUNC("ldff1sb z6.d, p1/z, [%[base], %[index]]"), + { /*zt=*/6, /*pg=*/1 }, + std::array { -10, -11, -12, -13, -14, 
-15, 0x0000000000000000, + 0x0000000000000001 }, + std::array { -1, 0 }, + std::array { -1, -1, 0, 0 }, + std::array { -1, -1, -1, -1, 0, 0, 0, 0 }, + element_size_t::BYTE, + /*index=*/-6, }, + // LDFF1H instructions { - "ld1h scalar+scalar 64bit element", - TEST_FUNC("ld1h z19.d, p3/z, [%[base], %[index], lsl #1]"), - { /*zt=*/ { 19 }, /*pg=*/3 }, - std::array, 1> { - { 0x000000000000fff2, 0x000000000000fff1, 0x0000000000000000, - 0x0000000000000001, 0x0000000000000002, 0x0000000000000003, - 0x0000000000000004, 0x0000000000000005 } }, + "ldff1h scalar+scalar 16bit element", + TEST_FUNC("ldff1h z7.h, p0/z, [%[base], %[index], lsl #1]"), + { /*zt=*/7, /*pg=*/0 }, + std::array { + 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, 0x0014, + 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, 0x0022, + 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, 0xfff2, + 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005, 0x0006 }, + std::array { 0xfff1, 0xffff, 0xffff, 0xffff, 0x0000, 0x0000, + 0x0000, 0x0000 }, + std::array { 0xfff2, 0xffff, 0xffff, 0xffff, 0xfff1, 0xffff, + 0xffff, 0xffff, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000 }, + std::array { + 0xfff4, 0xffff, 0xffff, 0xffff, 0xfff3, 0xffff, 0xffff, 0xffff, + 0xfff2, 0xffff, 0xffff, 0xffff, 0xfff1, 0xffff, 0xffff, 0xffff, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, + 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000 }, element_size_t::HALF, - /*index=*/-2, + /*index=*/7, }, { - "ldnt1h scalar+scalar", - TEST_FUNC("ldnt1h z15.h, p4/z, [%[base], %[index], lsl #1]"), - { /*zt=*/ { 15 }, /*pg=*/4 }, - std::array, 1> { - { 0x0006, 0x0007, 0x0008, 0x0009, 0x0010, 0x0011, 0x0012, 0x0013, - 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, 0x0020, 0x0021, - 0x0022, 0x0023, 0xfff8, 0xfff7, 0xfff6, 0xfff5, 0xfff4, 0xfff3, - 0xfff2, 0xfff1, 0x0000, 0x0001, 0x0002, 0x0003, 0x0004, 0x0005 } }, + "ldff1h scalar+scalar 32bit element", + TEST_FUNC("ldff1h z8.s, p1/z, [%[base], %[index], lsl #1]"), + { /*zt=*/8, /*pg=*/1 }, + std::array { 0x0000fff8, 0x0000fff7, 0x0000fff6, 0x0000fff5, + 0x0000fff4, 0x0000fff3, 0x0000fff2, 0x0000fff1, + 0x00000000, 0x00000001, 0x00000002, 0x00000003, + 0x00000004, 0x00000005, 0x00000006, 0x00000007 }, + std::array { 0x0000ffff, 0x0000ffff, 0x00000000, 0x00000000 }, + std::array { 0x0000fff1, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + std::array { 0x0000fff2, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x0000fff1, 0x0000ffff, 0x0000ffff, 0x0000ffff, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, element_size_t::HALF, - /*index=*/6, + /*index=*/-8, }, - // LD1SH { - "ld1sh scalar+scalar 32bit element", - TEST_FUNC("ld1sh z11.s, p5/z, [%[base], %[index], lsl #1]"), - { /*zt=*/ { 11 }, /*pg=*/5 }, - std::array, 1> { - { 0x00000009, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014, - 0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x00000020, - 0x00000021, 0x00000022, 0x00000023, 0xfffffff8 } }, + "ldff1h scalar+scalar 64bit element", + TEST_FUNC("ldff1h z9.d, p2/z, [%[base], %[index], lsl #1]"), + { /*zt=*/9, /*pg=*/2 }, + std::array { 0x0000000000000009, 0x0000000000000010, + 0x0000000000000011, 0x0000000000000012, + 0x0000000000000013, 0x0000000000000014, + 0x0000000000000015, 0x0000000000000016 }, + std::array { 0x000000000000ffff, 0x0000000000000000 }, + std::array { 0x000000000000ffff, 0x000000000000ffff, + 0x0000000000000000, 
0x0000000000000000 },
+            std::array { 0x000000000000fff1, 0x000000000000ffff,
+                                   0x000000000000ffff, 0x000000000000ffff,
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0x0000000000000000, 0x0000000000000000 },
             element_size_t::HALF,
             /*index=*/9,
         },
+        // LDFF1SH instructions
         {
-            "ld1sh scalar+scalar 32bit element",
-            TEST_FUNC("ld1sh z11.s, p5/z, [%[base], %[index], lsl #1]"),
-            { /*zt=*/ { 11 }, /*pg=*/5 },
-            std::array, 1> {
-                { 0x00000009, 0x00000010, 0x00000011, 0x00000012, 0x00000013, 0x00000014,
-                  0x00000015, 0x00000016, 0x00000017, 0x00000018, 0x00000019, 0x00000020,
-                  0x00000021, 0x00000022, 0x00000023, 0xfffffff8 } },
+            "ldff1sh scalar+scalar 32bit element",
+            TEST_FUNC("ldff1sh z10.s, p3/z, [%[base], %[index], lsl #1]"),
+            { /*zt=*/10, /*pg=*/3 },
+            std::array { 0x00000022, 0x00000023, -8, -9, -10, -11, -12, -13,
+                                  -14, -15, 0x00000000, 0x00000001, 0x00000002,
+                                  0x00000003, 0x00000004, 0x00000005 },
+            std::array { -1, -1, 0, 0 },
+            std::array { -15, -1, -1, -1, 0, 0, 0, 0 },
+            std::array { -14, -1, -1, -1, -15, -1, -1, -1, 0, 0, 0, 0, 0, 0,
+                                  0, 0 },
             element_size_t::HALF,
-            /*index=*/9,
+            /*index=*/-10,
         },
         {
-            "ld1sh scalar+scalar 64bit element",
-            TEST_FUNC("ld1sh z7.d, p6/z, [%[base], %[index], lsl #1]"),
-            { /*zt=*/ { 7 }, /*pg=*/6 },
-            std::array, 1> {
-                { 0xfffffffffffffff2, 0xfffffffffffffff1, 0x0000000000000000,
-                  0x0000000000000001, 0x0000000000000002, 0x0000000000000003,
-                  0x0000000000000004, 0x0000000000000005 } },
+            "ldff1sh scalar+scalar 64bit element",
+            TEST_FUNC("ldff1sh z11.d, p4/z, [%[base], %[index], lsl #1]"),
+            { /*zt=*/11, /*pg=*/4 },
+            std::array { 0x0000000000000011, 0x0000000000000012,
+                                   0x0000000000000013, 0x0000000000000014,
+                                   0x0000000000000015, 0x0000000000000016,
+                                   0x0000000000000017, 0x0000000000000018 },
+            std::array { -1, 0 },
+            std::array { -1, -1, 0, 0 },
+            std::array { -15, -1, -1, -1, 0, 0, 0, 0 },
             element_size_t::HALF,
-            /*index=*/-2,
+            /*index=*/11,
         },
+        // LDFF1W instructions
         {
-            "ldnt1w scalar+scalar",
-            TEST_FUNC("ldnt1w z5.s, p5/z, [%[base], %[index], lsl #2]"),
-            { /*zt=*/ { 5 }, /*pg=*/5 },
-            std::array, 1> {
-                { 0x00000018, 0x00000019, 0x00000020, 0x00000021, 0x00000022, 0x00000023,
-                  0xfffffff8, 0xfffffff7, 0xfffffff6, 0xfffffff5, 0xfffffff4, 0xfffffff3,
-                  0xfffffff2, 0xfffffff1, 0x00000000, 0x00000001 } },
+            "ldff1w scalar+scalar 32bit element",
+            TEST_FUNC("ldff1w z12.s, p5/z, [%[base], %[index], lsl #2]"),
+            { /*zt=*/12, /*pg=*/5 },
+            std::array { 0x00000020, 0x00000021, 0x00000022, 0x00000023,
+                                   0xfffffff8, 0xfffffff7, 0xfffffff6, 0xfffffff5,
+                                   0xfffffff4, 0xfffffff3, 0xfffffff2, 0xfffffff1,
+                                   0x00000000, 0x00000001, 0x00000002, 0x00000003 },
+            std::array { 0xfffffff1, 0xffffffff, 0x00000000, 0x00000000 },
+            std::array { 0xfffffff2, 0xffffffff, 0xfffffff1, 0xffffffff,
+                                   0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+            std::array { 0xfffffff4, 0xffffffff, 0xfffffff3, 0xffffffff,
+                                   0xfffffff2, 0xffffffff, 0xfffffff1, 0xffffffff,
+                                   0x00000000, 0x00000000, 0x00000000, 0x00000000,
+                                   0x00000000, 0x00000000, 0x00000000, 0x00000000 },
             element_size_t::SINGLE,
-            /*index=*/18,
+            /*index=*/-12,
         },
-        // LD1SW
         {
-            "ld1sw scalar+scalar",
-
TEST_FUNC("ld1sw z9.d, p4/z, [%[base], %[index], lsl #2]"), - { /*zt=*/ { 9 }, /*pg=*/4 }, - std::array, 1> { - { 0xfffffffffffffff1, 0x0000000000000000, 0x0000000000000001, - 0x0000000000000002, 0x0000000000000003, 0x0000000000000004, - 0x0000000000000005, 0x0000000000000006 } }, + "ldff1w scalar+scalar 64bit element", + TEST_FUNC("ldff1w z13.d, p6/z, [%[base], %[index], lsl #2]"), + { /*zt=*/13, /*pg=*/6 }, + std::array { 0x0000000000000013, 0x0000000000000014, + 0x0000000000000015, 0x0000000000000016, + 0x0000000000000017, 0x0000000000000018, + 0x0000000000000019, 0x0000000000000020 }, + std::array { 0x0000000fffffffff, 0x0000000000000000 }, + std::array { 0x0000000ffffffff1, 0x0000000fffffffff, + 0x0000000000000000, 0x0000000000000000 }, + std::array { 0x0000000ffffffff2, 0x000000000fffffff, + 0x00000000fffffff1, 0x0000000ffffffff6, + 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000 }, element_size_t::SINGLE, - /*index=*/-1, - }, - // LD1D - { - "ld1d scalar+scalar", - TEST_FUNC("ld1d z13.d, p3/z, [%[base], %[index], lsl #3]"), - { /*zt=*/ { 13 }, /*pg=*/3 }, - std::array, 1> { - { 0x0000000000000008, 0x0000000000000009, 0x0000000000000010, - 0x0000000000000011, 0x0000000000000012, 0x0000000000000013, - 0x0000000000000014, 0x0000000000000015 } }, - element_size_t::DOUBLE, - /*index=*/8, - }, - { - "ldnt1d scalar+scalar", - TEST_FUNC("ldnt1d z17.d, p2/z, [%[base], %[index], lsl #3]"), - { /*zt=*/ { 17 }, /*pg=*/2 }, - std::array, 1> { - { 0x0000000000000002, 0x0000000000000003, 0x0000000000000004, - 0x0000000000000005, 0x0000000000000006, 0x0000000000000007, - 0x0000000000000008, 0x0000000000000009 } }, - element_size_t::DOUBLE, - /*index=*/2, - }, - // Load and replicate instructions - { - "ld1rqb scalar+scalar", - TEST_FUNC("ld1rqb z21.b, p1/z, [%[base], %[index]]"), - { /*zt=*/ { 21 }, /*pg=*/1 }, - std::array, 1> { - { 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, - 0x17, 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, - 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21, 0x06, - 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, - 0x18, 0x19, 0x20, 0x21, 0x06, 0x07, 0x08, 0x09, 0x10, 0x11, 0x12, - 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x20, 0x21 } }, - element_size_t::BYTE, - /*index=*/6, - /*loaded_vector_size=*/16, - }, - { - "ld1rqh scalar+scalar", - TEST_FUNC("ld1rqh z25.h, p0/z, [%[base], %[index], lsl #1]"), - { /*zt=*/ { 25 }, /*pg=*/0 }, - std::array, 1> { - { 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, - 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, - 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019, - 0x0012, 0x0013, 0x0014, 0x0015, 0x0016, 0x0017, 0x0018, 0x0019 } }, - element_size_t::HALF, - /*index=*/12, - /*loaded_vector_size=*/16, + /*index=*/13, }, + // LDFF1SW instructions { - "ld1rqw scalar+scalar", - TEST_FUNC("ld1rqw z29.s, p1/z, [%[base], %[index], lsl #2]"), - { /*zt=*/ { 29 }, /*pg=*/1 }, - std::array, 1> { - { 0x00000020, 0x00000021, 0x00000022, 0x00000023, 0x00000020, 0x00000021, - 0x00000022, 0x00000023, 0x00000020, 0x00000021, 0x00000022, 0x00000023, - 0x00000020, 0x00000021, 0x00000022, 0x00000023 } }, + "ldff1sw scalar+scalar", + TEST_FUNC("ldff1sw z14.d, p7/z, [%[base], %[index], lsl #2]"), + { /*zt=*/14, /*pg=*/7 }, + std::array { 0x0000000000000018, 0x0000000000000019, + 0x0000000000000020, 0x0000000000000021, + 0x0000000000000022, 0x0000000000000023, -8, -9 }, + std::array { -1, 0 }, + std::array { -15, -1, 
0, 0 },
+            std::array { -14, -1, -15, -1, 0, 0, 0, 0 },
             element_size_t::SINGLE,
-            /*index=*/-12,
-            /*loaded_vector_size=*/16,
+            /*index=*/-14,
         },
+        // LDFF1D instructions
         {
-            "ld1rqd scalar+scalar",
-            TEST_FUNC("ld1rqd z31.d, p2/z, [%[base], %[index], lsl #3]"),
-            { /*zt=*/ { 31 }, /*pg=*/2 },
-            std::array, 1> {
-                { 0xfffffffffffffff6, 0xfffffffffffffff5, 0xfffffffffffffff6,
-                  0xfffffffffffffff5, 0xfffffffffffffff6, 0xfffffffffffffff5,
-                  0xfffffffffffffff6, 0xfffffffffffffff5 } },
+            "ldff1d scalar+scalar",
+            TEST_FUNC("ldff1d z15.d, p6/z, [%[base], %[index], lsl #3]"),
+            { /*zt=*/15, /*pg=*/6 },
+            std::array { 0x0000000000000015, 0x0000000000000016,
+                                   0x0000000000000017, 0x0000000000000018,
+                                   0x0000000000000019, 0x0000000000000020,
+                                   0x0000000000000021, 0x0000000000000022 },
+            std::array { 0xfffffffffffffff1, 0x0000000000000000 },
+            std::array { 0xfffffffffffffff2, 0xfffffffffffffff1,
+                                   0x0000000000000000, 0x0000000000000000 },
+            std::array { 0xfffffffffffffff4, 0xfffffffffffffff3,
+                                   0xfffffffffffffff2, 0xfffffffffffffff1,
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0x0000000000000000, 0x0000000000000000 },
             element_size_t::DOUBLE,
-            /*index=*/-6,
-            /*loaded_vector_size=*/16,
+            /*index=*/15,
         },
     });
 #    undef TEST_FUNC
 }
@@ -5688,7 +7116,8 @@ struct scalar_plus_immediate_non_fault_load_test_case_t
     }
 
     void
-    check_fault(bool expected_fault, bool signal_handler_called) override
+    check_fault(predicate_reg_value128_t pred, bool expected_fault,
+                size_t faulting_element, bool signal_handler_called) override
     {
         // Non-fault instructions should never trigger the signal handler.
         if (signal_handler_called) {
@@ -7291,14 +8720,20 @@ main(int argc, char **argv)
 #if defined(__ARM_FEATURE_SVE)
     if (test_ld1_scalar_plus_vector() == FAIL)
         status = FAIL;
+    if (test_ldff1_scalar_plus_vector() == FAIL)
+        status = FAIL;
     if (test_st1_scalar_plus_vector() == FAIL)
         status = FAIL;
     if (test_ld1_vector_plus_immediate() == FAIL)
         status = FAIL;
+    if (test_ldff1_vector_plus_immediate() == FAIL)
+        status = FAIL;
     if (test_st1_vector_plus_immediate() == FAIL)
         status = FAIL;
     if (test_ld1_scalar_plus_scalar() == FAIL)
         status = FAIL;
+    if (test_ldff1_scalar_plus_scalar() == FAIL)
+        status = FAIL;
     if (test_ld2_scalar_plus_scalar() == FAIL)
         status = FAIL;
     if (test_ld3_scalar_plus_scalar() == FAIL)
diff --git a/suite/tests/client-interface/drx-scattergather-aarch64.templatex b/suite/tests/client-interface/drx-scattergather-aarch64.templatex
index 057a4907ac4..fb9fe36913b 100644
--- a/suite/tests/client-interface/drx-scattergather-aarch64.templatex
+++ b/suite/tests/client-interface/drx-scattergather-aarch64.templatex
@@ -58,6 +58,65 @@ ld1d scalar\+vector 32bit unpacked unscaled offset sxtw: PASS
 ld1d scalar\+vector 64bit scaled offset: PASS
 ld1d scalar\+vector 64bit unscaled offset: PASS
 ld1d scalar\+vector 64bit unscaled offset Zt==Zm: PASS
+ldff1b scalar\+vector 32bit unscaled offset uxtw: PASS
+ldff1b scalar\+vector 32bit unscaled offset sxtw: PASS
+ldff1b scalar\+vector 32bit unpacked unscaled offset uxtw: PASS
+ldff1b scalar\+vector 32bit unpacked unscaled offset sxtw: PASS
+ldff1b scalar\+vector 64bit unscaled offset: PASS
+ldff1b scalar\+vector 64bit unscaled offset Zt==Zm: PASS
+ldff1sb scalar\+vector 32bit unscaled offset uxtw: PASS
+ldff1sb scalar\+vector 32bit unscaled offset sxtw: PASS
+ldff1sb scalar\+vector 32bit unpacked unscaled offset uxtw: PASS
+ldff1sb scalar\+vector 32bit unpacked unscaled offset sxtw: PASS
+ldff1sb scalar\+vector 64bit unscaled offset: PASS
+ldff1sb scalar\+vector 64bit unscaled offset Zt==Zm: PASS
+ldff1h scalar\+vector 32bit scaled offset uxtw: PASS +ldff1h scalar\+vector 32bit scaled offset sxtw: PASS +ldff1h scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1h scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1h scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1h scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1h scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1h scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1h scalar\+vector 64bit scaled offset: PASS +ldff1h scalar\+vector 64bit unscaled offset: PASS +ldff1h scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1sh scalar\+vector 32bit scaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit scaled offset sxtw: PASS +ldff1sh scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1sh scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1sh scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1sh scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1sh scalar\+vector 64bit scaled offset: PASS +ldff1sh scalar\+vector 64bit unscaled offset: PASS +ldff1sh scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1w scalar\+vector 32bit scaled offset uxtw: PASS +ldff1w scalar\+vector 32bit scaled offset sxtw: PASS +ldff1w scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1w scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1w scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1w scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1w scalar\+vector 32bit unscaled offset uxtw: PASS +ldff1w scalar\+vector 32bit unscaled offset sxtw: PASS +ldff1w scalar\+vector 64bit scaled offset: PASS +ldff1w scalar\+vector 64bit unscaled offset: PASS +ldff1w scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1sw scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1sw scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1sw scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1sw scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1sw scalar\+vector 64bit scaled offset: PASS +ldff1sw scalar\+vector 64bit unscaled offset: PASS +ldff1sw scalar\+vector 64bit unscaled offset Zt==Zm: PASS +ldff1d scalar\+vector 32bit unpacked scaled offset uxtw: PASS +ldff1d scalar\+vector 32bit unpacked scaled offset sxtw: PASS +ldff1d scalar\+vector 32bit unpacked unscaled offset uxtw: PASS +ldff1d scalar\+vector 32bit unpacked unscaled offset sxtw: PASS +ldff1d scalar\+vector 64bit scaled offset: PASS +ldff1d scalar\+vector 64bit unscaled offset: PASS +ldff1d scalar\+vector 64bit unscaled offset Zt==Zm: PASS st1b scalar\+vector 32bit unpacked unscaled offset uxtw: PASS st1b scalar\+vector 32bit unpacked unscaled offset sxtw: PASS st1b scalar\+vector 32bit unscaled offset uxtw: PASS @@ -112,6 +171,21 @@ ld1sw vector\+immediate 64bit element \(max index\): PASS ld1d vector\+immediate 64bit element: PASS ld1d vector\+immediate 64bit element \(max index\): PASS ld1d vector\+immediate 64bit element Zt==Zn: PASS +ldff1b vector\+immediate 64bit element: PASS +ldff1b vector\+immediate 64bit element \(max index\): PASS +ldff1sb vector\+immediate 64bit element: PASS +ldff1sb vector\+immediate 64bit element \(max index\): PASS +ldff1h vector\+immediate 64bit element: PASS +ldff1h vector\+immediate 64bit element \(max index\): PASS +ldff1sh vector\+immediate 64bit element: PASS +ldff1sh vector\+immediate 64bit 
element \(max index\): PASS +ldff1w vector\+immediate 64bit element: PASS +ldff1w vector\+immediate 64bit element \(max index\): PASS +ldff1sw vector\+immediate 64bit element: PASS +ldff1sw vector\+immediate 64bit element \(max index\): PASS +ldff1d vector\+immediate 64bit element: PASS +ldff1d vector\+immediate 64bit element \(max index\): PASS +ldff1d vector\+immediate 64bit element Zt==Zn: PASS st1b vector\+immediate 64bit element: PASS st1b vector\+immediate 64bit element \(max index\): PASS st1b vector\+immediate 64bit element \(repeated base\): PASS @@ -148,6 +222,22 @@ ld1rqb scalar\+scalar: PASS ld1rqh scalar\+scalar: PASS ld1rqw scalar\+scalar: PASS ld1rqd scalar\+scalar: PASS +ldff1b scalar\+scalar 8bit element: PASS +ldff1b scalar\+scalar 16bit element: PASS +ldff1b scalar\+scalar 32bit element: PASS +ldff1b scalar\+scalar 64bit element: PASS +ldff1sb scalar\+scalar 16bit element: PASS +ldff1sb scalar\+scalar 32bit element: PASS +ldff1sb scalar\+scalar 64bit element: PASS +ldff1h scalar\+scalar 16bit element: PASS +ldff1h scalar\+scalar 32bit element: PASS +ldff1h scalar\+scalar 64bit element: PASS +ldff1sh scalar\+scalar 32bit element: PASS +ldff1sh scalar\+scalar 64bit element: PASS +ldff1w scalar\+scalar 32bit element: PASS +ldff1w scalar\+scalar 64bit element: PASS +ldff1sw scalar\+scalar: PASS +ldff1d scalar\+scalar: PASS ld2b scalar\+scalar: PASS ld2h scalar\+scalar: PASS ld2w scalar\+scalar: PASS @@ -312,9 +402,9 @@ stnt1d vector\+scalar 64bit unscaled offset \(repeated base\): PASS #endif /* __ARM_FEATURE_SVE2 */ #ifndef TEST_SAMPLE_CLIENT #if defined(__ARM_FEATURE_SVE2) -event_exit, 3936 scatter/gather instructions +event_exit, 5144 scatter/gather instructions #elif defined( __ARM_FEATURE_SVE) -event_exit, 3756 scatter/gather instructions +event_exit, 4964 scatter/gather instructions #else event_exit, 0 scatter/gather instructions #endif /* __ARM_FEATURE_SVE */