@@ -1121,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
11211121#define GGML_F32x4_ADD vaddq_f32
11221122#define GGML_F32x4_MUL vmulq_f32
11231123#define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
1124- #define GGML_F32x4_REDUCE(res, x) \
1125- { \
1126- int offset = GGML_F32_ARR >> 1; \
1127- for (int i = 0; i < offset; ++i) { \
1128- x [i] = vaddq_f32(x [i], x [offset+i]); \
1129- } \
1130- offset >>= 1; \
1131- for (int i = 0; i < offset; ++i) { \
1132- x [i] = vaddq_f32(x [i], x [offset+i]); \
1133- } \
1134- offset >>= 1; \
1135- for (int i = 0; i < offset; ++i) { \
1136- x [i] = vaddq_f32(x [i], x [offset+i]); \
1137- } \
1138- res = GGML_F32x4_REDUCE_ONE(x [0]); \
1124+ #define GGML_F32x4_REDUCE(res, x) \
1125+ { \
1126+ int offset = GGML_F32_ARR >> 1; \
1127+ for (int i = 0; i < offset; ++i) { \
1128+ (x) [i] = vaddq_f32((x) [i], (x) [offset+i]); \
1129+ } \
1130+ offset >>= 1; \
1131+ for (int i = 0; i < offset; ++i) { \
1132+ (x) [i] = vaddq_f32((x) [i], (x) [offset+i]); \
1133+ } \
1134+ offset >>= 1; \
1135+ for (int i = 0; i < offset; ++i) { \
1136+ (x) [i] = vaddq_f32((x) [i], (x) [offset+i]); \
1137+ } \
1138+ ( res) = GGML_F32x4_REDUCE_ONE((x) [0]); \
11391139}
11401140
11411141#define GGML_F32_VEC GGML_F32x4
@@ -1162,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
11621162 #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
11631163 #define GGML_F16x8_ADD vaddq_f16
11641164 #define GGML_F16x8_MUL vmulq_f16
1165- #define GGML_F16x8_REDUCE(res, x) \
1166- do { \
1167- int offset = GGML_F16_ARR >> 1; \
1168- for (int i = 0; i < offset; ++i) { \
1169- x [i] = vaddq_f16(x [i], x [offset+i]); \
1170- } \
1171- offset >>= 1; \
1172- for (int i = 0; i < offset; ++i) { \
1173- x [i] = vaddq_f16(x [i], x [offset+i]); \
1174- } \
1175- offset >>= 1; \
1176- for (int i = 0; i < offset; ++i) { \
1177- x [i] = vaddq_f16(x [i], x [offset+i]); \
1178- } \
1179- const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x [0])); \
1180- const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x [0])); \
1181- res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
1165+ #define GGML_F16x8_REDUCE(res, x) \
1166+ do { \
1167+ int offset = GGML_F16_ARR >> 1; \
1168+ for (int i = 0; i < offset; ++i) { \
1169+ (x) [i] = vaddq_f16((x) [i], (x) [offset+i]); \
1170+ } \
1171+ offset >>= 1; \
1172+ for (int i = 0; i < offset; ++i) { \
1173+ (x) [i] = vaddq_f16((x) [i], (x) [offset+i]); \
1174+ } \
1175+ offset >>= 1; \
1176+ for (int i = 0; i < offset; ++i) { \
1177+ (x) [i] = vaddq_f16((x) [i], (x) [offset+i]); \
1178+ } \
1179+ const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x) [0])); \
1180+ const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x) [0])); \
1181+ ( res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1)); \
11821182 } while (0)
11831183
11841184 #define GGML_F16_VEC GGML_F16x8
11851185 #define GGML_F16_VEC_ZERO GGML_F16x8_ZERO
11861186 #define GGML_F16_VEC_SET1 GGML_F16x8_SET1
11871187 #define GGML_F16_VEC_LOAD(p, i) GGML_F16x8_LOAD(p)
1188- #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r [i])
1188+ #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r) [i])
11891189 #define GGML_F16_VEC_FMA GGML_F16x8_FMA
11901190 #define GGML_F16_VEC_ADD GGML_F16x8_ADD
11911191 #define GGML_F16_VEC_MUL GGML_F16x8_MUL
0 commit comments