@@ -119,6 +119,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
 }
 
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        for (int i = 0; i < n; ++i) {
+            for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
+                sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
+            }
+        }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC sum[GGML_VEC_DOT_UNROLL][GGML_F16_ARR] = { { GGML_F16_VEC_ZERO } };
@@ -149,6 +157,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
             sumf[j] += (ggml_float)(GGML_CPU_FP16_TO_FP32(x[j][i])*GGML_CPU_FP16_TO_FP32(y[i]));
         }
     }
+    #endif
 #else
     for (int i = 0; i < n; ++i) {
         for (int j = 0; j < GGML_VEC_DOT_UNROLL; ++j) {
@@ -243,6 +252,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
             svst1_f32(pg, y + np2, ay1);
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -276,6 +293,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
 
 inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y, const ggml_fp16_t * GGML_RESTRICT x, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
+        }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -297,6 +321,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i]) + GGML_CPU_FP16_TO_FP32(x[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
@@ -324,6 +349,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
                 y[i] += x[k][i]*v[k][0];
             }
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            for (int k = 0; k < GGML_VEC_MAD_UNROLL; k++) {
+                vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[k][i], avl);
+                ay = __riscv_vfmadd_vf_f32m8(ax, v[k][0], ay, avl);
+            }
+            __riscv_vse32_v_f32m8(&y[i], ay, avl);
+        }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -375,6 +410,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
         for (int i = 0; i < n; ++i) {
             y[i] = x[i]*s + b;
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
+            vfloat32m8_t vb = __riscv_vfmv_v_f_f32m8(b, avl);
+            vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, s, vb, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -436,6 +479,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
             ay1 = svmul_f32_m(pg, ay1, vx);
             svst1_f32(pg, y + np, ay1);
         }
+    #elif defined(__riscv_v_intrinsic)
+        for (int i = 0, avl; i < n; i += avl) {
+            avl = __riscv_vsetvl_e32m8(n - i);
+            vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
+            vfloat32m8_t ny = __riscv_vfmul_vf_f32m8(ay, v, avl);
+            __riscv_vse32_v_f32m8(&y[i], ny, avl);
+        }
     #else
     const int np = (n & ~(GGML_F32_STEP - 1));
 
@@ -467,6 +517,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) {
 
 inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float v) {
 #if defined(GGML_SIMD)
+    #if defined(__riscv_v_intrinsic)
+        // todo: RVV impl
+        // scalar
+        for (int i = 0; i < n; ++i) {
+            y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
+        }
+    #else
     const int np = (n & ~(GGML_F16_STEP - 1));
 
     GGML_F16_VEC vx = GGML_F16_VEC_SET1(v);
@@ -486,6 +543,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
     for (int i = np; i < n; ++i) {
         y[i] = GGML_CPU_FP32_TO_FP16(GGML_CPU_FP16_TO_FP32(y[i])*v);
     }
+    #endif
 #else
     // scalar
     for (int i = 0; i < n; ++i) {
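Note (not part of the patch): every f32 branch added above uses the same RVV strip-mining idiom. __riscv_vsetvl_e32m8 asks the hardware how many 32-bit lanes it grants for the remaining n - i elements, the body processes exactly that many, and the loop needs no separate scalar tail. The sketch below shows that idiom standalone for the ggml_vec_mad_f32 case, checked against a scalar reference. It is only a sketch: it assumes a compiler that ships the RVV intrinsics (<riscv_vector.h>, e.g. recent clang or GCC with -march=rv64gcv), and the function and variable names are illustrative, not ggml's.

#include <riscv_vector.h>
#include <stdio.h>

// Same pattern as the __riscv_v_intrinsic branch of ggml_vec_mad_f32: y += x*v.
static void vec_mad_f32_rvv(int n, float *y, const float *x, float v) {
    // avl = number of e32/m8 lanes granted for the elements that remain.
    for (int i = 0, avl; i < n; i += avl) {
        avl = __riscv_vsetvl_e32m8(n - i);
        vfloat32m8_t ax = __riscv_vle32_v_f32m8(&x[i], avl);
        vfloat32m8_t ay = __riscv_vle32_v_f32m8(&y[i], avl);
        // vfmadd.vf computes ax*v + ay, i.e. the multiply-add into y.
        vfloat32m8_t ny = __riscv_vfmadd_vf_f32m8(ax, v, ay, avl);
        __riscv_vse32_v_f32m8(&y[i], ny, avl);
    }
}

int main(void) {
    enum { N = 1000 };
    float x[N], y[N], ref[N];
    for (int i = 0; i < N; ++i) { x[i] = (float)i; y[i] = ref[i] = 1.0f; }

    vec_mad_f32_rvv(N, y, x, 0.5f);           // vectorized y += x*0.5f
    for (int i = 0; i < N; ++i) {              // scalar reference
        ref[i] += x[i]*0.5f;
    }

    int ok = 1;
    for (int i = 0; i < N; ++i) { if (y[i] != ref[i]) ok = 0; }
    printf("%s\n", ok ? "match" : "mismatch");
    return 0;
}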