@@ -104,6 +104,14 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
104104    }
105105
106106#if  defined(GGML_SIMD )
107+ #if  defined(__riscv_v_intrinsic )
108+     // todo: RVV impl 
109+     for  (int  i  =  0 ; i  <  n ; ++ i ) {
110+         for  (int  j  =  0 ; j  <  GGML_VEC_DOT_UNROLL ; ++ j ) {
111+             sumf [j ] +=  (ggml_float )(GGML_CPU_FP16_TO_FP32 (x [j ][i ])* GGML_CPU_FP16_TO_FP32 (y [i ]));
112+         }
113+     }
114+ #else 
107115    const  int  np  =  (n  &  ~(GGML_F16_STEP  -  1 ));
108116
109117    GGML_F16_VEC  sum [GGML_VEC_DOT_UNROLL ][GGML_F16_ARR ] =  { { GGML_F16_VEC_ZERO  } };
@@ -134,6 +142,7 @@ inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * GG
134142            sumf [j ] +=  (ggml_float )(GGML_CPU_FP16_TO_FP32 (x [j ][i ])* GGML_CPU_FP16_TO_FP32 (y [i ]));
135143        }
136144    }
145+ #endif 
137146#else 
138147    for  (int  i  =  0 ; i  <  n ; ++ i ) {
139148        for  (int  j  =  0 ; j  <  GGML_VEC_DOT_UNROLL ; ++ j ) {
@@ -228,6 +237,14 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
228237
229238            svst1_f32 (pg , y  +  np2 , ay1 );
230239        }
240+     #elif  defined(__riscv_v_intrinsic )
241+         for  (int  i  =  0 , avl ; i  <  n ; i  +=  avl ) {
242+             avl  =  __riscv_vsetvl_e32m8 (n  -  i );
243+             vfloat32m8_t  ax  =  __riscv_vle32_v_f32m8 (& x [i ], avl );
244+             vfloat32m8_t  ay  =  __riscv_vle32_v_f32m8 (& y [i ], avl );
245+             vfloat32m8_t  ny  =  __riscv_vfmadd_vf_f32m8 (ax , v , ay , avl );
246+             __riscv_vse32_v_f32m8 (& y [i ], ny , avl );
247+         }
231248    #else 
232249        const  int  np  =  (n  &  ~(GGML_F32_STEP  -  1 ));
233250
@@ -261,6 +278,13 @@ inline static void ggml_vec_mad_f32(const int n, float * GGML_RESTRICT y, const
261278
262279inline  static  void  ggml_vec_mad_f16 (const  int  n , ggml_fp16_t  *  GGML_RESTRICT  y , const  ggml_fp16_t  *  GGML_RESTRICT  x , const  float  v ) {
263280#if  defined(GGML_SIMD )
281+ #if  defined(__riscv_v_intrinsic )
282+     // todo: RVV impl 
283+     // scalar 
284+     for  (int  i  =  0 ; i  <  n ; ++ i ) {
285+         y [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ]) +  GGML_CPU_FP16_TO_FP32 (x [i ])* v );
286+     }
287+ #else 
264288    const  int  np  =  (n  &  ~(GGML_F16_STEP  -  1 ));
265289
266290    GGML_F16_VEC  vx  =  GGML_F16_VEC_SET1 (v );
@@ -282,6 +306,7 @@ inline static void ggml_vec_mad_f16(const int n, ggml_fp16_t * GGML_RESTRICT y,
282306    for  (int  i  =  np ; i  <  n ; ++ i ) {
283307        y [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ]) +  GGML_CPU_FP16_TO_FP32 (x [i ])* v );
284308    }
309+ #endif 
285310#else 
286311    // scalar 
287312    for  (int  i  =  0 ; i  <  n ; ++ i ) {
@@ -309,6 +334,16 @@ inline static void ggml_vec_mad_f32_unroll(const int n, const int xs, const int
309334                y [i ] +=  x [k ][i ]* v [k ][0 ];
310335            }
311336        }
337+     #elif  defined(__riscv_v_intrinsic )
338+         for  (int  i  =  0 , avl ; i  <  n ; i  +=  avl ) {
339+             avl  =  __riscv_vsetvl_e32m8 (n  -  i );
340+             vfloat32m8_t  ay  =  __riscv_vle32_v_f32m8 (& y [i ], avl );
341+             for  (int  k  =  0 ; k  <  GGML_VEC_MAD_UNROLL ; k ++ ) {
342+                 vfloat32m8_t  ax  =  __riscv_vle32_v_f32m8 (& x [k ][i ], avl );
343+                 ay  =  __riscv_vfmadd_vf_f32m8 (ax , v [k ][0 ], ay , avl );
344+             }
345+             __riscv_vse32_v_f32m8 (& y [i ], ay , avl );
346+         }
312347    #else 
313348        const  int  np  =  (n  &  ~(GGML_F32_STEP  -  1 ));
314349
@@ -360,6 +395,14 @@ inline static void ggml_vec_mad1_f32(const int n, float * y, const float * x, co
360395        for  (int  i  =  0 ; i  <  n ; ++ i ) {
361396            y [i ] =  x [i ]* s  +  b ;
362397        }
398+     #elif  defined(__riscv_v_intrinsic )
399+         for  (int  i  =  0 , avl ; i  <  n ; i  +=  avl ) {
400+             avl  =  __riscv_vsetvl_e32m8 (n  -  i );
401+             vfloat32m8_t  ax  =  __riscv_vle32_v_f32m8 (& x [i ], avl );
402+             vfloat32m8_t  vb  =  __riscv_vfmv_v_f_f32m8 (b , avl );
403+             vfloat32m8_t  ny  =  __riscv_vfmadd_vf_f32m8 (ax , s , vb , avl );
404+             __riscv_vse32_v_f32m8 (& y [i ], ny , avl );
405+         }
363406    #else 
364407        const  int  np  =  (n  &  ~(GGML_F32_STEP  -  1 ));
365408
@@ -421,6 +464,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
421464            ay1  =  svmul_f32_m (pg , ay1 , vx );
422465            svst1_f32 (pg , y  +  np , ay1 );
423466        }
467+     #elif  defined(__riscv_v_intrinsic )
468+         for  (int  i  =  0 , avl ; i  <  n ; i  +=  avl ) {
469+             avl  =  __riscv_vsetvl_e32m8 (n  -  i );
470+             vfloat32m8_t  ay  =  __riscv_vle32_v_f32m8 (& y [i ], avl );
471+             vfloat32m8_t  ny  =  __riscv_vfmul_vf_f32m8 (ay , v , avl );
472+             __riscv_vse32_v_f32m8 (& y [i ], ny , avl );
473+         }
424474    #else 
425475        const  int  np  =  (n  &  ~(GGML_F32_STEP  -  1 ));
426476
@@ -452,6 +502,13 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float   v) {
452502
453503inline  static  void  ggml_vec_scale_f16 (const  int  n , ggml_fp16_t  *  y , const  float  v ) {
454504#if  defined(GGML_SIMD )
505+ #if  defined(__riscv_v_intrinsic )
506+     // todo: RVV impl 
507+     // scalar 
508+     for  (int  i  =  0 ; i  <  n ; ++ i ) {
509+         y [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ])* v );
510+     }
511+ #else 
455512    const  int  np  =  (n  &  ~(GGML_F16_STEP  -  1 ));
456513
457514    GGML_F16_VEC  vx  =  GGML_F16_VEC_SET1 (v );
@@ -471,6 +528,7 @@ inline static void ggml_vec_scale_f16(const int n, ggml_fp16_t * y, const float
471528    for  (int  i  =  np ; i  <  n ; ++ i ) {
472529        y [i ] =  GGML_CPU_FP32_TO_FP16 (GGML_CPU_FP16_TO_FP32 (y [i ])* v );
473530    }
531+ #endif 
474532#else 
475533    // scalar 
476534    for  (int  i  =  0 ; i  <  n ; ++ i ) {
0 commit comments