diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml index 05e0487cf54..147e01f21ad 100644 --- a/.github/workflows/linux-riscv64-cpu-gcc.yml +++ b/.github/workflows/linux-riscv64-cpu-gcc.yml @@ -117,7 +117,7 @@ jobs: #id: cache-riscv #uses: actions/cache@v3 #with: - #path: rv64gcv-install + #path: rv64gcv-install-next #key: rv64gcv-linux-install-20210504 #- name: install-riscv-build-deps @@ -132,31 +132,31 @@ jobs: #with: #repository: riscv/riscv-gnu-toolchain #path: riscv-gnu-toolchain - #ref: 28271f03bb538d926ad2889dc8ad1b0cb1b3b45c + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 #- name: checkout-riscv-gnu-toolchain-submodules #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib #git submodule update --init --recursive --depth 1 riscv-binutils #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib #git submodule update --init --recursive --depth 1 riscv-gdb #- name: riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install --with-arch=rv64gcv_zfh + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh #make linux #- name: riscv-strip-install #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install -type f | xargs -i strip -g {} || true + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
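The configure step above cross-compiles with toolchains/riscv64-unknown-linux-gnu.toolchain.cmake against the rv64gcv-install-next prefix, and the test steps further on run the resulting binaries under qemu-riscv64 with v=true, Zfh=true and vlen=256/128. As a quick sanity check of such a toolchain/qemu pair, here is a minimal standalone sketch written in the same intrinsics style the patch converts to (size_t for the vector length instead of the removed word_type, unprefixed intrinsic names as used throughout the hunks below; newer toolchains spell these with a __riscv_ prefix). The file name, the riscv64-unknown-linux-gnu-g++ driver name and the <riscv_vector.h> header are assumptions for illustration, not taken from the workflow itself.

```cpp
// smoke.cpp - minimal RVV strip-mining loop in the patch's intrinsic style.
// Assumed build/run (hypothetical paths):
//   riscv64-unknown-linux-gnu-g++ -march=rv64gcv smoke.cpp -o smoke
//   qemu-riscv64 -cpu rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0 -L <sysroot> ./smoke
#include <riscv_vector.h>
#include <cstdio>

int main()
{
    float in[7] = {-1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
    float out[7] = {0.f};

    const float* ptr = in;
    float* outptr = out;
    int n = 7; // deliberately not a power of two so the tail gets a shorter vl

    while (n > 0)
    {
        size_t vl = vsetvl_e32m8(n);              // size_t, not the removed word_type
        vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); // load up to vl elements
        _p = vfadd_vf_f32m8(_p, 1.f, vl);         // vector + scalar
        vse32_v_f32m8(outptr, _p, vl);            // store
        ptr += vl;
        outptr += vl;
        n -= vl;
    }

    printf("%f %f\n", out[0], out[6]); // expect 0.000000 6.000000
    return 0;
}
```

If this builds with -march=rv64gcv and runs under the same qemu invocation as the test steps, the toolchain and emulator are set up consistently with what the ctest jobs expect.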
- name: build run: cmake --build build -j 4 @@ -164,10 +164,10 @@ jobs: run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - name: test-vlen128 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml new file mode 100644 index 00000000000..18ad114efa4 --- /dev/null +++ b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml @@ -0,0 +1,142 @@ +name: linux-riscv64-cpu-gnu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-riscv64-rvv: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v3 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20220502-3 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: qemu/qemu + #path: qemu + #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j2 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gcv-install-next + #key: rv64gcv-linux-install-20210504 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool 
patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh + #make linux + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true + + # - name: install-clang + # run: | + # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz + # tar -xf llvm-project-15.0.1.src.tar.xz + # cd llvm-project-15.0.1.src + # mkdir build + # cd build + # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ + # make -j16 + # make install + + - name: build + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next + export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 4 + + - name: test-vlen256 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 + + - name: test-vlen128 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/CMakeLists.txt b/CMakeLists.txt index c453d23e544..4ece7ada739 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -296,10 +296,15 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") include(CheckCXXCompilerFlag) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; float _v; word_type vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) + check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh") - check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; word_type vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_FP16) + check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZFH) + + if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") + check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH) + endif() unset(CMAKE_REQUIRED_FLAGS) @@ -309,9 +314,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") if(NCNN_RVV_CHECK_VFREDSUM) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake) endif() - if(NOT NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH)) message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.") endif() + option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compilter about rvv segment load/store interface" ON) + if(NCNN_RVV_CHECK_PLAIN_SEGMENT) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") + check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG) + unset(CMAKE_REQUIRED_FLAGS) + endif() + if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG) + message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.") + add_definitions(-D__rvv_tuple) + endif() else() message(WARNING "The compiler does not support risc-v v extension. 
NCNN_RVV will be OFF.") endif() diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 89d61823deb..8abb13331a9 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -271,8 +271,10 @@ macro(ncnn_add_layer class) endif() if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") elseif(NCNN_COMPILER_SUPPORT_RVV) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") endif() diff --git a/cmake/ncnn_check_rvv_vfredusum.cmake b/cmake/ncnn_check_rvv_vfredusum.cmake index 81496a765d1..59065556356 100644 --- a/cmake/ncnn_check_rvv_vfredusum.cmake +++ b/cmake/ncnn_check_rvv_vfredusum.cmake @@ -9,7 +9,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredsum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -23,7 +23,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -36,7 +36,7 @@ if(NCNN_COMPILER_USE_VFREDSUM AND NOT NCNN_COMPILER_USE_VFREDUSUM) message(WARNING "The compiler uses vfredsum. Upgrading your toolchain is strongly recommended.") foreach(LMUL 1 2 4 8) add_definitions(-Dvfredusum_vs_f32m${LMUL}_f32m1=vfredsum_vs_f32m${LMUL}_f32m1) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH) add_definitions(-Dvfredusum_vs_f16m${LMUL}_f16m1=vfredsum_vs_f16m${LMUL}_f16m1) endif() endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3a851554c34..11b8573462a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -459,8 +459,10 @@ endif() if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh) + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16) elseif(NCNN_COMPILER_SUPPORT_RVV) target_compile_options(ncnn PRIVATE -march=rv64gcv) endif() diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp index 4a41788ec9e..092a8b5d6b1 100644 --- a/src/layer/riscv/absval_riscv.cpp +++ b/src/layer/riscv/absval_riscv.cpp @@ -66,7 +66,7 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfabs_v_f32m8_absval(_p, vl); @@ -106,7 +106,7 @@ int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfabs_v_f16m8_absval(_p, vl); diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index b4e53a2c856..9858e654822 100644 
--- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -67,7 +67,7 @@ static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(a0, _p, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -108,7 +108,7 @@ static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, b0, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -149,7 +149,7 @@ static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -217,7 +217,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -252,7 +252,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -289,7 +289,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -328,7 +328,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -367,7 +367,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -400,7 +400,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, *ptr1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -436,7 +436,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -469,7 +469,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _p = vfmv_v_f_f32m8(*ptr, 
vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -508,7 +508,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -545,7 +545,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr1_vol = ptr1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1_vol, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -583,7 +583,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -620,7 +620,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -662,7 +662,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -699,7 +699,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -736,7 +736,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -774,7 +774,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -825,7 +825,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -867,7 +867,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -899,7 +899,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); 
vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -931,7 +931,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -985,7 +985,7 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, b, vl); vse32_v_f32m8(ptr, _p, vl); @@ -1000,21 +1000,21 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; MAKE_FUNCTION(binary_op_add_rvv, vfadd_vv_f32m8(x, y, vl), vfadd_vf_f32m8(x, y, vl), vfadd_vf_f32m8(y, x, vl)) @@ -1159,7 +1159,7 @@ static int binary_op_2_3_4_20_fp16s(const Mat& a, const Mat& b, Mat& c, const Op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(a0, _p, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1200,7 +1200,7 @@ static int binary_op_6_11_16_25_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, b0, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1241,7 +1241,7 @@ static int binary_op_7_13_19_29_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1309,7 +1309,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1344,7 +1344,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1381,7 +1381,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - 
word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1420,7 +1420,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1459,7 +1459,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1492,7 +1492,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, *ptr1, vl); @@ -1527,7 +1527,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1560,7 +1560,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _p = vfmv_v_f_f16m8(*ptr, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1598,7 +1598,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1637,7 +1637,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr1_vol = ptr1 + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1_vol, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1676,7 +1676,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1715,7 +1715,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr_vol = ptr + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_vol, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1758,7 +1758,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1795,7 +1795,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const 
Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1832,7 +1832,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1870,7 +1870,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1921,7 +1921,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1963,7 +1963,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1995,7 +1995,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2027,7 +2027,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2706,7 +2706,7 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, b, vl); vse16_v_f16m8(ptr, _p, vl); @@ -2721,25 +2721,25 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - __fp16 operator()(const __fp16& x, const __fp16& y) const \ - { \ - return IMPL; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + __fp16 operator()(const __fp16& x, const __fp16& y) const \ + { \ + return IMPL; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const size_t vl) const \ 
+ { \ + return IMPLVS; \ + } \ + vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; // clang-format off diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 8ea5d0f05ef..5d0642e7da7 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -101,7 +101,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat16m4_t _outp = vfncvt_f_f_w_f16m4(_p, vl); @@ -125,7 +125,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat32m8_t _outp = vfwcvt_f_f_v_f32m8(_p, vl); diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp index 9acff0218f0..8c43e06a4d8 100644 --- a/src/layer/riscv/clip_riscv.cpp +++ b/src/layer/riscv/clip_riscv.cpp @@ -62,7 +62,7 @@ int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -107,7 +107,7 @@ int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -139,7 +139,7 @@ int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmax_vf_f16m8(_p, min, vl); diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp index d80d9985b47..5736fd25dcd 100644 --- a/src/layer/riscv/concat_riscv.cpp +++ b/src/layer/riscv/concat_riscv.cpp @@ -143,7 +143,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -266,7 +266,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int size = bottom_blob.w * bottom_blob.h; @@ -487,7 +487,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -610,7 +610,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = bottom_blob.w * bottom_blob.h; diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index a956d394f17..483aa511672 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ 
b/src/layer/riscv/convolution1d_riscv.cpp @@ -119,7 +119,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -476,7 +476,7 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -697,7 +697,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/convolution_1x1_packn.h b/src/layer/riscv/convolution_1x1_packn.h index 8f55d260abc..31bf72ba3d0 100644 --- a/src/layer/riscv/convolution_1x1_packn.h +++ b/src/layer/riscv/convolution_1x1_packn.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, con static void conv1x1s2_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packn_fp16s.h b/src/layer/riscv/convolution_1x1_packn_fp16s.h index 110d61dc121..5ac3f8967ce 100644 --- a/src/layer/riscv/convolution_1x1_packn_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packn_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_bl static void conv1x1s2_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1.h b/src/layer/riscv/convolution_1x1_packnto1.h index 0cd1747586e..a3e1204a325 100644 --- a/src/layer/riscv/convolution_1x1_packnto1.h +++ b/src/layer/riscv/convolution_1x1_packnto1.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv1x1s2_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h index 04e86f97dca..10591ab27f2 100644 --- a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top static void conv1x1s2_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = 
vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton.h b/src/layer/riscv/convolution_3x3_pack1ton.h index bb123ef8997..9adcfb1e263 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton.h +++ b/src/layer/riscv/convolution_3x3_pack1ton.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h index e25c7d09097..bff24a0099f 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton.h b/src/layer/riscv/convolution_7x7_pack1ton.h index 06c4dfe2f6a..3605ed027cd 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton.h +++ b/src/layer/riscv/convolution_7x7_pack1ton.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h index 91ee1b7d826..01804bf391d 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton.h b/src/layer/riscv/convolution_pack1ton.h index f667f4d5d09..15eec7badd9 100644 --- a/src/layer/riscv/convolution_pack1ton.h +++ b/src/layer/riscv/convolution_pack1ton.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& 
weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index fc486173031..6f8c649e632 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -95,7 +95,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn.h b/src/layer/riscv/convolution_packn.h index c9b51d07881..9d18c1d858e 100644 --- a/src/layer/riscv/convolution_packn.h +++ b/src/layer/riscv/convolution_packn.h @@ -15,7 +15,7 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn_fp16s.h b/src/layer/riscv/convolution_packn_fp16s.h index 8ae4468495a..1f7b308e846 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -100,7 +100,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = 
vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1.h b/src/layer/riscv/convolution_packnto1.h index 7eda3858083..4c66116d20e 100644 --- a/src/layer/riscv/convolution_packnto1.h +++ b/src/layer/riscv/convolution_packnto1.h @@ -15,7 +15,7 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index 63aefbb5d5a..83efd3081f8 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -109,7 +109,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index c62db6c78ee..801b7cc456f 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ b/src/layer/riscv/convolution_sgemm.h @@ -16,7 +16,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 5cd5ea8a31e..72a621641db 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -16,7 +16,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton.h b/src/layer/riscv/convolution_sgemm_pack1ton.h index bc2f558a6d9..8a3e6ffbc43 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_rvv(const Mat& bottom_im2col, Mat& top_blob, const 
Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h index c3590a6ed6b..0c0b2791a8f 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_packn.h b/src/layer/riscv/convolution_sgemm_packn.h index 88518a23136..9255c092ae4 100644 --- a/src/layer/riscv/convolution_sgemm_packn.h +++ b/src/layer/riscv/convolution_sgemm_packn.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -78,7 +78,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -119,7 +119,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -156,7 +156,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -363,7 +363,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons static void convolution_im2col_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index 977dc38204a..cb3b65196ed 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ 
-15,7 +15,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -109,7 +109,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -172,7 +172,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -228,7 +228,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -435,7 +435,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo static void convolution_im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index 212cf98b39b..2df2c7d7656 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 
= vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; +#ifdef __clang__ + // clang complains about VLA in the following loop + float* _zero_tmp = new float[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const float* zeros = _zero_tmp; +#else const float zeros[packn] = {0.f}; +#endif // __clang__ const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 6, top_blob.cstep * sizeof(float), _sum6, vl); vsse32_v_f32m1(outptr0 + 7, top_blob.cstep * sizeof(float), _sum7, vl); #else - vssseg8e32_v_f32m1x8(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); #else - vssseg4e32_v_f32m1x4(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); #else - vssseg2e32_v_f32m1x2(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x2(_sum0, _sum1), vl); + vssseg2e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x8_t _val01 = vlseg8e32_v_f32m1x8(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vfloat32m1_t _val4; + vfloat32m1_t _val5; + vfloat32m1_t _val6; + vfloat32m1_t _val7; + 
vlseg8e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x8_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x8_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x8_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x8_f32m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f32m1(_sum4, vget_f32m1x8_f32m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f32m1(_sum5, vget_f32m1x8_f32m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f32m1(_sum6, vget_f32m1x8_f32m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f32m1(_sum7, vget_f32m1x8_f32m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f32m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f32m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f32m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f32m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,16 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x4_t _val01 = vlseg4e32_v_f32m1x4(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vlseg4e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x4_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x4_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x4_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x4_f32m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +546,12 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x2_t _val01 = vlseg2e32_v_f32m1x2(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vlseg2e32_v_f32m1(&_val0, &_val1, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x2_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x2_f32m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +677,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_rvv(const Mat& _k static void convolution_im2col_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index d6dd867397c..925713d9826 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void 
im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; + // make clang happy with the following loop +#ifdef __clang__ + __fp16* _zero_tmp = new __fp16[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const __fp16* zeros = _zero_tmp; +#else const __fp16 zeros[packn] = {0.f}; +#endif // __clang__ const __fp16* biasptr = bias ? 
bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 6, top_blob.cstep * sizeof(__fp16), _sum6, vl); vsse16_v_f16m1(outptr0 + 7, top_blob.cstep * sizeof(__fp16), _sum7, vl); #else - vssseg8e16_v_f16m1x8(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); #else - vssseg4e16_v_f16m1x4(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); #else - vssseg2e16_v_f16m1x2(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x2(_sum0, _sum1), vl); + vssseg2e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x8_t _val01 = vlseg8e16_v_f16m1x8(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + vfloat16m1_t _val4; + vfloat16m1_t _val5; + vfloat16m1_t _val6; + vfloat16m1_t _val7; + vlseg8e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x8_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x8_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x8_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x8_f16m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f16m1(_sum4, vget_f16m1x8_f16m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f16m1(_sum5, vget_f16m1x8_f16m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f16m1(_sum6, vget_f16m1x8_f16m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f16m1(_sum7, vget_f16m1x8_f16m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,17 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x4_t _val01 = vlseg4e16_v_f16m1x4(tmpptr, vl); + vfloat16m1_t _val0; 
+ vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + + vlseg4e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x4_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x4_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x4_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x4_f16m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +547,12 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x2_t _val01 = vlseg2e16_v_f16m1x2(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vlseg2e16_v_f16m1(&_val0, &_val1, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x2_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x2_f16m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +678,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(const static void convolution_im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_winograd_dot.h b/src/layer/riscv/convolution_winograd_dot.h index 8ea6bc9c576..c0a7b7680f8 100644 --- a/src/layer/riscv/convolution_winograd_dot.h +++ b/src/layer/riscv/convolution_winograd_dot.h @@ -16,7 +16,7 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_winograd_dot_packn.h b/src/layer/riscv/convolution_winograd_dot_packn.h index 434eaa00c68..1c505d5c2e1 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn.h +++ b/src/layer/riscv/convolution_winograd_dot_packn.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val5 = vle32_v_f32m1(r0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(r0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(r0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; 
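The hunks in these RISC-V source files all apply the same two mechanical substitutions: the active vector length returned by vsetvl_* is now held in a plain size_t rather than the old word_type alias, and the tuple-type segment load/store intrinsics (vsseg4e32_v_f32m1x4 with vcreate_f32m1x4, vlseg4e32_v_f32m1x4 with vget_f32m1x4_f32m1, and their e16 / x2 / x8 siblings) are replaced by flat forms that take or return the individual vector registers. For reference, a minimal stand-alone sketch of the before/after usage follows; it is illustrative only, not part of the patch, and assumes a toolchain providing the non-tuple RVV intrinsics used above:

    #include <riscv_vector.h>

    // Interleave four packn-wide rows into dst as r0[i], r1[i], r2[i], r3[i], ...
    // then de-interleave them again; mirrors the tmpptr packing done in the hunks.
    static void interleave4_f32(float* dst, const float* r0, const float* r1,
                                const float* r2, const float* r3, int packn)
    {
        const size_t vl = vsetvl_e32m1(packn); // previously declared as word_type

        vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl);
        vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl);
        vfloat32m1_t _p2 = vle32_v_f32m1(r2, vl);
        vfloat32m1_t _p3 = vle32_v_f32m1(r3, vl);

        // old tuple-type form removed by this patch:
        //   vsseg4e32_v_f32m1x4(dst, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl);
        // new form passes the segment registers directly:
        vsseg4e32_v_f32m1(dst, _p0, _p1, _p2, _p3, vl);

        // the matching de-interleaving load follows the same pattern,
        // returning each segment through an output pointer:
        vfloat32m1_t _q0;
        vfloat32m1_t _q1;
        vfloat32m1_t _q2;
        vfloat32m1_t _q3;
        vlseg4e32_v_f32m1(&_q0, &_q1, &_q2, &_q3, dst, vl);
        (void)_q0; (void)_q1; (void)_q2; (void)_q3; // silence unused warnings in this sketch
    }

The strided variants (vssseg8e32_v_f32m1 and friends) and the f16 versions in these files follow the same renaming, so the numerical behaviour of the kernels is unchanged.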
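One change in the packnto1 sgemm hunks is not a pure rename: the per-channel bias fallback const float zeros[packn] = {0.f}; (and its __fp16 counterpart) has a runtime length, which clang rejects according to the comment added in those hunks, so under #ifdef __clang__ the patch switches to a heap-allocated zero buffer created before the parallel loop and freed after it. A reduced sketch of that pattern, using a hypothetical bias_fallback() harness and taking nothing from ncnn beyond the workaround itself:

    // Hypothetical harness illustrating the clang VLA workaround used in the hunks;
    // packn is only known at runtime, so a stack array of that size is non-portable.
    void bias_fallback(const float* bias, int packn, int outch)
    {
    #ifdef __clang__
        float* _zero_tmp = new float[packn](); // value-initialized to 0.f
        const float* zeros = _zero_tmp;
    #else
        const float zeros[packn] = {0.f};      // runtime-length array, accepted by gcc here
    #endif

        for (int p = 0; p < outch; p++)
        {
            const float* biasptr = bias ? bias + p : zeros;
            (void)biasptr; // per-channel kernel work would use biasptr here
        }

    #ifdef __clang__
        delete[] _zero_tmp;
    #endif
    }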
@@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(r0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(r0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c #else vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h index 0b731519426..ed35ad3e378 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val5 = vle16_v_f16m1(r0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(r0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(r0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(r0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(r0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o #else vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_transform_packn.h b/src/layer/riscv/convolution_winograd_transform_packn.h index db3a05aa92f..f5a52970759 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn.h +++ b/src/layer/riscv/convolution_winograd_transform_packn.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& 
bottom_blo static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index b1b1ad9f54d..2404a8a4092 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot static 
void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn.h b/src/layer/riscv/convolutiondepthwise_3x3_packn.h index d8aa0ec4ee0..0cab1af0802 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h index c3d73053bea..d479385f6a2 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn.h b/src/layer/riscv/convolutiondepthwise_5x5_packn.h index cd35ef8e816..2ef2fea7455 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t 
vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h index 1647f96db8c..08270e307c9 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index e33360e0609..eb39ac0baa7 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -282,7 +282,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -710,7 +710,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -920,7 +920,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp index f7b44efd1a1..80e76fc47b4 100644 --- a/src/layer/riscv/crop_riscv.cpp +++ b/src/layer/riscv/crop_riscv.cpp @@ -43,7 +43,7 @@ static void crop_packn_rvv(const Mat& src, Mat& dst, int top, int left, int pack int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float* ptr = src.row(top) + left * packn; float* outptr = dst; @@ -69,7 +69,7 @@ static void crop_packn_bf16_fp16s_rvv(const Mat& src, Mat& dst, int top, int lef int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const unsigned short* ptr = src.row(top) + left * packn; unsigned short* outptr = dst; diff --git a/src/layer/riscv/deconvolution_pack1ton.h 
b/src/layer/riscv/deconvolution_pack1ton.h index dfbe8e01a2d..ec18f62c1c6 100644 --- a/src/layer/riscv/deconvolution_pack1ton.h +++ b/src/layer/riscv/deconvolution_pack1ton.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_pack1ton_fp16s.h b/src/layer/riscv/deconvolution_pack1ton_fp16s.h index a1fcfefc254..168c709217d 100644 --- a/src/layer/riscv/deconvolution_pack1ton_fp16s.h +++ b/src/layer/riscv/deconvolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -103,7 +103,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn.h b/src/layer/riscv/deconvolution_packn.h index 457e2b95c92..8cab6c3b0a1 100644 --- a/src/layer/riscv/deconvolution_packn.h +++ b/src/layer/riscv/deconvolution_packn.h @@ -15,7 +15,7 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn_fp16s.h b/src/layer/riscv/deconvolution_packn_fp16s.h index 46d52470ad0..62fbd2eb731 100644 --- a/src/layer/riscv/deconvolution_packn_fp16s.h +++ b/src/layer/riscv/deconvolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -105,7 +105,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const 
Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1.h b/src/layer/riscv/deconvolution_packnto1.h index ba81baf3676..2efa9b154d2 100644 --- a/src/layer/riscv/deconvolution_packnto1.h +++ b/src/layer/riscv/deconvolution_packnto1.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 5cb0a3c49bd..ab70100fb3b 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -116,7 +116,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index ab20e6c4148..b53e8962fd2 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -210,7 +210,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // convolv with NxN kernel @@ -518,7 +518,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -739,7 +739,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - 
const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/dropout_riscv.cpp b/src/layer/riscv/dropout_riscv.cpp index fc71db7689a..461edf2d056 100644 --- a/src/layer/riscv/dropout_riscv.cpp +++ b/src/layer/riscv/dropout_riscv.cpp @@ -53,7 +53,7 @@ int Dropout_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8(_p, scale, vl); diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 325ab6f175d..491c051c7fe 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -119,7 +119,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, w * sizeof(float), _p, vl); @@ -147,7 +147,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, size * sizeof(float), _p, vl); @@ -172,7 +172,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vse32_v_f32m8(outptr, _p, vl); @@ -262,7 +262,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); vsse16_v_u16m1(outptr, w * sizeof(unsigned short), _p, vl); @@ -290,7 +290,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); vsse16_v_u16m1(outptr, size * sizeof(unsigned short), _p, vl); @@ -315,7 +315,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vuint16m8_t _p = vle16_v_u16m8(ptr, vl); vse16_v_u16m8(outptr, _p, vl); @@ -405,7 +405,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -433,7 +433,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); @@ -458,7 +458,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m8(n); + size_t vl = vsetvl_e8m8(n); vint8m8_t _p = vle8_v_i8m8(ptr, vl); vse8_v_i8m8(outptr, _p, vl); diff --git a/src/layer/riscv/gelu_riscv.cpp b/src/layer/riscv/gelu_riscv.cpp index 708e951e5a3..69b374998f3 
100644 --- a/src/layer/riscv/gelu_riscv.cpp +++ b/src/layer/riscv/gelu_riscv.cpp @@ -48,7 +48,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vfloat32m4_t _p = vle32_v_f32m4(ptr, vl); @@ -77,7 +77,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); auto _p = vle32_v_f32m8(ptr, vl); auto _perfc = vfmul_vf_f32m8(_p, -.70710678f, vl); _p = vfmul_vf_f32m8(_p, .5f, vl); diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index e45d37592ef..28afa5081d0 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -63,7 +63,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _x = vle32_v_f32m8(ptr_x, vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -93,7 +93,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e32m8(n_out); + size_t vl = vsetvl_e32m8(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -136,7 +136,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e32m8(n_out2); + size_t vl = vsetvl_e32m8(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -160,7 +160,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _x = vle32_v_f32m8(ptr_x2, vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -428,7 +428,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x, vl), vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -458,7 +458,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -501,7 +501,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -525,7 +525,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m4(n2); + size_t vl = vsetvl_e16m4(n2); vfloat32m8_t _x = 
vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x2, vl), vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -758,7 +758,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _x = vle16_v_f16m8(ptr_x, vl); vfloat16m8_t _xcr = vle16_v_f16m8(ptr_xcr, vl); vfloat16m8_t _xcu = vle16_v_f16m8(ptr_xcu, vl); @@ -785,7 +785,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc, vl), vl); vfloat16m4_t _hcr = vle16_v_f16m4(ptr_hcr, vl); vfloat16m4_t _hcu = vle16_v_f16m4(ptr_hcu, vl); @@ -825,7 +825,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc2, vl), vl); vfloat16m4_t _whc_n = vle16_v_f16m4(ptr_whc_n, vl); @@ -846,7 +846,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m8(n2); + size_t vl = vsetvl_e16m8(n2); vfloat16m8_t _x = vle16_v_f16m8(ptr_x2, vl); vfloat16m8_t _xcn = vle16_v_f16m8(ptr_xcn, vl); diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 2c3bbec2886..112a1c9c8d2 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -60,7 +60,7 @@ int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index b60197115ca..5d68e07b06a 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -60,7 +60,7 @@ int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 721c6361b8b..30dd7428777 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -198,7 +198,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (elempack == packn && num_output_elempack == packn) { - const word_type vl = 
vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -237,7 +237,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -273,7 +273,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -372,7 +372,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -414,7 +414,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt { int p = pp * packn; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -595,7 +595,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -635,7 +635,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -672,7 +672,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -765,7 +765,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); if (bias_term) @@ -857,7 +857,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -897,7 +897,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -934,7 +934,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -1027,7 +1027,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = 
vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); if (bias_term) diff --git a/src/layer/riscv/interp_bicubic_packn.h b/src/layer/riscv/interp_bicubic_packn.h index 16ed365ff53..4c4eb869c43 100644 --- a/src/layer/riscv/interp_bicubic_packn.h +++ b/src/layer/riscv/interp_bicubic_packn.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index b83a9eba3c6..ff2284552b7 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -244,7 +244,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear.h b/src/layer/riscv/interp_bilinear.h index 1742626017a..0f6338d7310 100644 --- a/src/layer/riscv/interp_bilinear.h +++ b/src/layer/riscv/interp_bilinear.h @@ -86,16 +86,17 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -135,19 +136,21 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S0p = vloxseg2ei32_v_f32m4x2(S0, _sx, vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S0p0 = vget_f32m4x2_f32m4(_S0p, 0); - vfloat32m4_t _S0p1 = vget_f32m4x2_f32m4(_S0p, 1); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S0p0; + vfloat32m4_t _S0p1; + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + + vloxseg2ei32_v_f32m4(&_S0p0, &_S0p1, S0, _sx, vl); + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, 
S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows0 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S0p0, _a0, vl), _S0p1, _a1, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -192,7 +195,7 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 091e86b7301..cd61af6efac 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -131,7 +131,7 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, int n = w; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); @@ -232,7 +232,7 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha int n = w; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _rows0 = vle16_v_f16m8(rows0p, vl); vfloat16m8_t _rows1 = vle16_v_f16m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_packn.h b/src/layer/riscv/interp_bilinear_packn.h index 0d800e324cb..9dffc01bf30 100644 --- a/src/layer/riscv/interp_bilinear_packn.h +++ b/src/layer/riscv/interp_bilinear_packn.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index b48fd8431a4..dfe02c00d1b 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -122,7 +122,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index b72cfd00280..ea8344985ed 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -88,7 +88,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -130,7 +130,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -153,7 +153,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -190,7 +190,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -328,7 +328,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -518,7 +518,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -558,7 +558,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -581,7 +581,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -618,7 +618,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -754,7 +754,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -955,7 +955,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect { if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -992,7 +992,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index abee1ec3748..4ddb1470006 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -64,7 +64,7 @@ int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -103,7 +103,7 @@ int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -134,7 +134,7 @@ int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vv_f16m8(_p, tanh_ps(log_ps(vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 1805c2469eb..5c298da522d 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -18,6 +18,8 @@ #include #endif // __riscv_vector +#include "riscv_usability.h" + namespace ncnn { Packing_riscv::Packing_riscv() @@ -137,13 +139,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -181,13 +183,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); + + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -229,7 +236,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = 
vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -239,7 +246,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -289,17 +296,25 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse32_v_f32m1(outptr0, _p0, vl); + vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -343,19 +358,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -395,19 +412,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = 
vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -466,13 +483,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -510,13 +527,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); - - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + size_t vl = vsetvl_e32m2(n); + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -558,7 +578,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -568,7 +588,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -618,17 +638,26 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse32_v_f32m1(outptr0, _p0, vl); + 
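// NOTE: the pack8 -> pack1 path below de-interleaves eight rows at once.
// vlseg8e32_v_f32m1(&_p0, ..., r0, vl) is the compatibility helper added in
// riscv_usability.h later in this diff: on toolchains that expose the RVV
// tuple types it wraps vlseg8e32_v_f32m1x8() plus one vget_f32m1x8_f32m1()
// per field, so the call sites no longer spell out vfloat32m1x8_t.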
vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -672,20 +701,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -725,19 +755,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -859,13 +889,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -903,13 +933,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, 
vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -951,7 +985,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -961,7 +995,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1011,17 +1045,26 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1065,19 +1108,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1117,19 +1162,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& 
bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -1188,13 +1234,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -1232,13 +1278,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -1280,7 +1330,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -1290,7 +1340,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1340,17 +1390,25 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), 
vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1394,20 +1452,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1447,19 +1506,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 1f93ecfe92d..50f5efe1216 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -16,7 +16,7 @@ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = 
vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -65,7 +65,7 @@ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -144,7 +144,7 @@ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index de29af0f6bf..8f4b54da590 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -91,7 +91,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -261,7 +261,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif int w = bottom_blob.w; @@ -511,7 +511,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt { #if __riscv_vector const int packn = csrr_vlenb() / 1; - const word_type vl = vsetvl_e8m1(packn); + const size_t vl = vsetvl_e8m1(packn); #endif int w = bottom_blob.w; diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 0ca4e3d894c..1b4c1f0ed8a 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -72,7 +72,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -315,7 +315,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op // avg value in NxN window const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -721,7 +721,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index c25223461a1..32cb77023b4 100644 --- a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -63,7 +63,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -84,7 +84,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type 
vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -115,7 +115,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -135,7 +135,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -170,7 +170,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -191,7 +191,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -303,7 +303,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -324,7 +324,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -355,7 +355,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -375,7 +375,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -410,7 +410,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -431,7 +431,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -468,7 +468,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); @@ -489,7 +489,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -520,7 +520,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); @@ -540,7 +540,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -575,7 +575,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(slope_ptr, vl), vl); @@ -596,7 +596,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index 6b23ebc3a63..cf2d4057069 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -58,10 +58,10 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmax_vf_f32m8(_p, (float32_t)0.f, vl); + _p = vfmax_vf_f32m8(_p, 0.f, vl); vse32_v_f32m8(ptr, _p, vl); ptr += vl; @@ -82,7 +82,7 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8_m(vmflt_vf_f32m8_b4(_p, .0f, vl), _p, _p, slope, vl); //slope: float(float32_t) @@ -124,10 +124,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmax_vf_f16m8(_p, (float16_t)0.f, vl); + _p = vfmax_vf_f16m8(_p, (__fp16)0.f, vl); vse16_v_f16m8(ptr, _p, vl); ptr += vl; @@ -137,10 +137,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c else { int n = size; - float16_t _slope = (float16_t)slope; + __fp16 _slope = (__fp16)slope; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vf_f16m8_m(vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); diff --git a/src/layer/riscv/riscv_activation.h b/src/layer/riscv/riscv_activation.h index 763e719b15d..d5f114f3aaa 100644 --- a/src/layer/riscv/riscv_activation.h +++ b/src/layer/riscv/riscv_activation.h @@ -22,49 +22,49 @@ #include "rvv_mathfun.h" #include "rvv_mathfun_fp16s.h" -#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ - static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, word_type vl) \ - { \ - if (activation_type == 1) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ - } \ - else if (activation_type == 2) \ - { \ - vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ - _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ - } \ - else if (activation_type == 3) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ - _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ - } \ - else if (activation_type == 4) \ - { \ - _v = sigmoid_ps(_v, vl); \ - } \ - else if (activation_type == 5) \ - { \ - _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ - } \ - else if (activation_type == 6) \ - { \ - const float alpha = activation_params[0]; \ - const float beta = activation_params[1]; \ - const float lower = -beta / alpha; \ - const float upper = (1.f / alpha) + lower; \ - vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ - vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ - vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ - _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, 
_v, .0f, vl); \ - \ - vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ - _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ - vl); \ - _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ - } \ - \ - return _v; \ +#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ + static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \ + { \ + if (activation_type == 1) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ + } \ + else if (activation_type == 2) \ + { \ + vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ + _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ + } \ + else if (activation_type == 3) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ + _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ + } \ + else if (activation_type == 4) \ + { \ + _v = sigmoid_ps(_v, vl); \ + } \ + else if (activation_type == 5) \ + { \ + _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ + } \ + else if (activation_type == 6) \ + { \ + const float alpha = activation_params[0]; \ + const float beta = activation_params[1]; \ + const float lower = -beta / alpha; \ + const float upper = (1.f / alpha) + lower; \ + vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ + vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ + vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ + _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ + \ + vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ + _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ + vl); \ + _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ + } \ + \ + return _v; \ } _RVV_FLOAT_ACTIVATION_PS(16, 1, 16) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index f60faad50f7..596bf4435c6 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -53,7 +53,7 @@ static inline int csrr_vlenb() static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m8(packn * 8); + const size_t vl = vsetvl_e32m8(packn * 8); // NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui @@ -90,7 +90,7 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m8(packn * 8); + const size_t vl = vsetvl_e16m8(packn * 8); // NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui @@ -125,4 +125,278 @@ static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) #endif // __riscv_zfh #endif // __riscv_vector +#if __riscv_vector && __rvv_tuple + +// f32m1, vsseg.v +static inline void vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e32_v_f32m1x8(base, _tmp, vl); +} + +static inline void vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +{ + vfloat32m1x4_t _tmp = 
vcreate_f32m1x4(v0, v1, v2, v3); + vsseg4e32_v_f32m1x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vsseg2e32_v_f32m1x2(base, _tmp, vl); +} + +// f32m1, vssseg.v, 8/4/2 +static inline void vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +{ + vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); + vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl); +} + +// f32m2, vsseg.v, 4/2 +static inline void vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl) +{ + vfloat32m2x4_t _tmp = vcreate_f32m2x4(v0, v1, v2, v3); + vsseg4e32_v_f32m2x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl) +{ + vfloat32m2x2_t _tmp = vcreate_f32m2x2(v0, v1); + vsseg2e32_v_f32m2x2(base, _tmp, vl); +} + +// u16m1, vsseg.v, 8/4 +static inline void vsseg8e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, vuint16m1_t v4, vuint16m1_t v5, vuint16m1_t v6, vuint16m1_t v7, size_t vl) +{ + vuint16m1x8_t _tmp = vcreate_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_u16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, size_t vl) +{ + vuint16m1x4_t _tmp = vcreate_u16m1x4(v0, v1, v2, v3); + vsseg4e16_v_u16m1x4(base, _tmp, vl); +} + +// u16m2, vsseg.v, 4/2 +static inline void vsseg4e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, vuint16m2_t v2, vuint16m2_t v3, size_t vl) +{ + vuint16m2x4_t _tmp = vcreate_u16m2x4(v0, v1, v2, v3); + vsseg4e16_v_u16m2x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl) +{ + vuint16m2x2_t _tmp = vcreate_u16m2x2(v0, v1); + vsseg2e16_v_u16m2x2(base, _tmp, vl); +} + +// f32m1, vlseg.v 8/4/2 +static inline void vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl) +{ + vfloat32m1x8_t _tmp = vlseg8e32_v_f32m1x8(base, vl); + *v0 = vget_f32m1x8_f32m1(_tmp, 0); + *v1 = vget_f32m1x8_f32m1(_tmp, 1); + *v2 = vget_f32m1x8_f32m1(_tmp, 2); + *v3 = vget_f32m1x8_f32m1(_tmp, 3); + *v4 = vget_f32m1x8_f32m1(_tmp, 4); + *v5 = vget_f32m1x8_f32m1(_tmp, 5); + *v6 = vget_f32m1x8_f32m1(_tmp, 6); + *v7 = vget_f32m1x8_f32m1(_tmp, 7); +} + +static inline void vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m1x4_t _tmp = vlseg4e32_v_f32m1x4(base, vl); + *v0 = vget_f32m1x4_f32m1(_tmp, 0); + *v1 = vget_f32m1x4_f32m1(_tmp, 1); + *v2 = vget_f32m1x4_f32m1(_tmp, 2); + *v3 = 
vget_f32m1x4_f32m1(_tmp, 3); +} + +static inline void vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m1x2_t _tmp = vlseg2e32_v_f32m1x2(base, vl); + *v0 = vget_f32m1x2_f32m1(_tmp, 0); + *v1 = vget_f32m1x2_f32m1(_tmp, 1); +} + +// f32m2, vlseg.v, 4 +static inline void vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m2x4_t _tmp = vlseg4e32_v_f32m2x4(base, vl); + *v0 = vget_f32m2x4_f32m2(_tmp, 0); + *v1 = vget_f32m2x4_f32m2(_tmp, 1); + *v2 = vget_f32m2x4_f32m2(_tmp, 2); + *v3 = vget_f32m2x4_f32m2(_tmp, 3); +} + +// f32m4, vlseg.v, 2 +static inline void vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m4x2_t _tmp = vlseg2e32_v_f32m4x2(base, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// f32m4, vloxseg.v +static inline void vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl) +{ + vfloat32m4x2_t _tmp = vloxseg2ei32_v_f32m4x2(base, bindex, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// u16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, vuint16m1_t* v4, vuint16m1_t* v5, vuint16m1_t* v6, vuint16m1_t* v7, const uint16_t* base, size_t vl) +{ + vuint16m1x8_t _tmp = vlseg8e16_v_u16m1x8(base, vl); + *v0 = vget_u16m1x8_u16m1(_tmp, 0); + *v1 = vget_u16m1x8_u16m1(_tmp, 1); + *v2 = vget_u16m1x8_u16m1(_tmp, 2); + *v3 = vget_u16m1x8_u16m1(_tmp, 3); + *v4 = vget_u16m1x8_u16m1(_tmp, 4); + *v5 = vget_u16m1x8_u16m1(_tmp, 5); + *v6 = vget_u16m1x8_u16m1(_tmp, 6); + *v7 = vget_u16m1x8_u16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m1x4_t _tmp = vlseg4e16_v_u16m1x4(base, vl); + *v0 = vget_u16m1x4_u16m1(_tmp, 0); + *v1 = vget_u16m1x4_u16m1(_tmp, 1); + *v2 = vget_u16m1x4_u16m1(_tmp, 2); + *v3 = vget_u16m1x4_u16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m1x2_t _tmp = vlseg2e16_v_u16m1x2(base, vl); + *v0 = vget_u16m1x2_u16m1(_tmp, 0); + *v1 = vget_u16m1x2_u16m1(_tmp, 1); +} + +// u16m2, vlseg.v, 4 +static inline void vlseg4e16_v_u16m2(vuint16m2_t* v0, vuint16m2_t* v1, vuint16m2_t* v2, vuint16m2_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m2x4_t _tmp = vlseg4e16_v_u16m2x4(base, vl); + *v0 = vget_u16m2x4_u16m2(_tmp, 0); + *v1 = vget_u16m2x4_u16m2(_tmp, 1); + *v2 = vget_u16m2x4_u16m2(_tmp, 2); + *v3 = vget_u16m2x4_u16m2(_tmp, 3); +} + +// u16m4, vlseg.v, 2 +static inline void vlseg2e16_v_u16m4(vuint16m4_t* v0, vuint16m4_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m4x2_t _tmp = vlseg2e16_v_u16m4x2(base, vl); + *v0 = vget_u16m4x2_u16m4(_tmp, 0); + *v1 = vget_u16m4x2_u16m4(_tmp, 1); +} + +#if __riscv_zfh + +// f16m1, vsseg.v, 8/4/2 +static inline void vsseg8e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_f16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) +{ + 
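// gather the four f16 vectors into a tuple and store them interleaved,
// mirroring the f32m1/u16m1 vsseg4 helpers above (guarded by __riscv_zfh)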
vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vsseg4e16_v_f16m1x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vsseg2e16_v_f16m1x2(base, _tmp, vl); +} + +// f16m1, vssseg.v, 8/4/2 +static inline void vssseg8e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e16_v_f16m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) +{ + vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vssseg4e16_v_f16m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vssseg2e16_v_f16m1x2(base, bstride, _tmp, vl); +} + +// f16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, vfloat16m1_t* v4, vfloat16m1_t* v5, vfloat16m1_t* v6, vfloat16m1_t* v7, const float16_t* base, size_t vl) +{ + vfloat16m1x8_t _tmp = vlseg8e16_v_f16m1x8(base, vl); + *v0 = vget_f16m1x8_f16m1(_tmp, 0); + *v1 = vget_f16m1x8_f16m1(_tmp, 1); + *v2 = vget_f16m1x8_f16m1(_tmp, 2); + *v3 = vget_f16m1x8_f16m1(_tmp, 3); + *v4 = vget_f16m1x8_f16m1(_tmp, 4); + *v5 = vget_f16m1x8_f16m1(_tmp, 5); + *v6 = vget_f16m1x8_f16m1(_tmp, 6); + *v7 = vget_f16m1x8_f16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m1x4_t _tmp = vlseg4e16_v_f16m1x4(base, vl); + *v0 = vget_f16m1x4_f16m1(_tmp, 0); + *v1 = vget_f16m1x4_f16m1(_tmp, 1); + *v2 = vget_f16m1x4_f16m1(_tmp, 2); + *v3 = vget_f16m1x4_f16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m1x2_t _tmp = vlseg2e16_v_f16m1x2(base, vl); + *v0 = vget_f16m1x2_f16m1(_tmp, 0); + *v1 = vget_f16m1x2_f16m1(_tmp, 1); +} + +// f16m2, vlseg.v, 4 +static inline void vlseg4e16_v_f16m2(vfloat16m2_t* v0, vfloat16m2_t* v1, vfloat16m2_t* v2, vfloat16m2_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m2x4_t _tmp = vlseg4e16_v_f16m2x4(base, vl); + *v0 = vget_f16m2x4_f16m2(_tmp, 0); + *v1 = vget_f16m2x4_f16m2(_tmp, 1); + *v2 = vget_f16m2x4_f16m2(_tmp, 2); + *v3 = vget_f16m2x4_f16m2(_tmp, 3); +} + +// f16m4, vlseg.v, 2 +static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m4x2_t _tmp = vlseg2e16_v_f16m4x2(base, vl); + *v0 = vget_f16m4x2_f16m4(_tmp, 0); + *v1 = vget_f16m4x2_f16m4(_tmp, 1); +} + +#endif // __riscv_zfh +#endif // __riscv_vector + #endif // RISCV_USABILITY_H diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 8993b5ad8e6..aa966de6c86 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ x = 
vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT32_LOG_OP(8, 4) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT32_EXP_OP(8, 4) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT32_SINCOS_OP(2, 16) _RVV_FLOAT32_SINCOS_OP(4, 8) _RVV_FLOAT32_SINCOS_OP(8, 4) -#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT32_SIN_OP(1, 32) @@ -270,12 +270,12 @@ _RVV_FLOAT32_SIN_OP(2, 16) _RVV_FLOAT32_SIN_OP(4, 8) _RVV_FLOAT32_SIN_OP(8, 4) -#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT32_COS_OP(1, 32) @@ -293,7 +293,7 @@ _RVV_FLOAT32_COS_OP(8, 4) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT32_TANH_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t x2 = vfsgnj_vf_f32m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT32_TANH_OP(2, 16) _RVV_FLOAT32_TANH_OP(4, 8) _RVV_FLOAT32_TANH_OP(8, 4) -#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT32_POW_OP(1, 32) @@ -354,7 +354,7 @@ _RVV_FLOAT32_POW_OP(4, 8) _RVV_FLOAT32_POW_OP(8, 4) #define _RVV_FLOAT32_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, word_type vl) \ + static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f32m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ @@ -447,8 +447,8 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) #define c_erfc_sb7 -2.2440952301e+01f /* 0xc1b38712 */ #define 
_RVV_FLOAT32_FMA_HELPER(LMUL) \ - static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float32_t b, \ - float32_t c, word_type vl) \ + static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float b, \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vf_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -456,7 +456,7 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) } \ \ static inline vfloat32m##LMUL##_t vfmadd_vvf_f32m##LMUL(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, \ - float32_t c, word_type vl) \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vv_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -469,7 +469,7 @@ _RVV_FLOAT32_FMA_HELPER(2) _RVV_FLOAT32_FMA_HELPER(1) #define _RVV_FLOAT32_ERFC_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ /* Argument for polys */ \ vfloat32m##LMUL##_t absx = vfsgnjx_vv_f32m##LMUL(x, x, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 129a4f94037..e7f18b961ae 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ x = vfmax_vf_f16m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f16m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT16_LOG_OP(8, 2) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT16_EXP_OP(8, 2) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT16_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat16m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT16_SINCOS_OP(2, 8) _RVV_FLOAT16_SINCOS_OP(4, 4) _RVV_FLOAT16_SINCOS_OP(8, 2) -#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT16_SIN_OP(1, 16) @@ -270,12 +270,12 @@ _RVV_FLOAT16_SIN_OP(2, 8) _RVV_FLOAT16_SIN_OP(4, 4) _RVV_FLOAT16_SIN_OP(8, 2) -#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + sincos_ps(x, 
&ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT16_COS_OP(1, 16) @@ -293,7 +293,7 @@ _RVV_FLOAT16_COS_OP(8, 2) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT16_TANH_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t x2 = vfsgnj_vf_f16m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT16_TANH_OP(2, 8) _RVV_FLOAT16_TANH_OP(4, 4) _RVV_FLOAT16_TANH_OP(8, 2) -#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT16_POW_OP(1, 16) @@ -354,7 +354,7 @@ _RVV_FLOAT16_POW_OP(4, 4) _RVV_FLOAT16_POW_OP(8, 2) #define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, word_type vl) \ + static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f16m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ diff --git a/src/layer/riscv/selu_riscv.cpp b/src/layer/riscv/selu_riscv.cpp index 9a4939c8421..932db355cc2 100644 --- a/src/layer/riscv/selu_riscv.cpp +++ b/src/layer/riscv/selu_riscv.cpp @@ -39,7 +39,7 @@ int SELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, 0.f, vl); vbool4_t _higher = vmnot_m_b4(_lower, vl); diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index afd07ea2b38..6c10582c668 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -64,7 +64,7 @@ int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = sigmoid_ps(_p, vl); @@ -104,7 +104,7 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = sigmoid_ps(_p, vl); @@ -135,7 +135,7 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = sigmoid_ps(_p, vl); diff --git a/src/layer/riscv/softmax_riscv.cpp b/src/layer/riscv/softmax_riscv.cpp index 7a93e5de18d..ca910c3d3c0 100644 --- a/src/layer/riscv/softmax_riscv.cpp +++ b/src/layer/riscv/softmax_riscv.cpp @@ -44,7 +44,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); vfloat32m1_t _max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); @@ -61,7 +61,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const 
Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); @@ -80,7 +80,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); @@ -112,7 +112,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -141,7 +141,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); @@ -168,7 +168,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -198,7 +198,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl); vfloat32m1_t _m = vfmv_s_f_f32m1(vundefined_f32m1(), m, vl); @@ -215,7 +215,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr2, vl); vfloat32m1_t _s = vfmv_s_f_f32m1(vundefined_f32m1(), s, vl); @@ -233,7 +233,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr3, vl); @@ -269,7 +269,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -295,7 +295,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -319,7 +319,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -358,7 +358,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -392,7 +392,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) 
cons while (n) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -422,7 +422,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -457,7 +457,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr_1, vl); vfloat32m1_t _scalar_max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); _scalar_max = vfredmax_vs_f32m8_f32m1(_scalar_max, _p, _scalar_max, vl); @@ -473,7 +473,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr_2, vl); vfloat32m1_t _scalar_sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); @@ -491,7 +491,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr_3, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index f12ab157ae9..17493d7db69 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -64,7 +64,7 @@ int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -103,7 +103,7 @@ int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -134,7 +134,7 @@ int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfdiv_vv_f16m8(_p, vfadd_vf_f16m8(exp_ps(vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index b0f0cafe7d7..d47de61dc59 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -64,7 +64,7 @@ int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = tanh_ps(_p, vl); @@ -103,7 +103,7 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = tanh_ps(_p, vl); @@ -134,7 +134,7 @@ int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) 
int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = tanh_ps(_p, vl); diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index 62c6a52740b..e5eb80151b1 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -55,7 +55,7 @@ static int unary_op_inplace(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, vl); @@ -73,7 +73,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsgnj_vf_f32m8(x, 1.f, vl); } @@ -81,7 +81,7 @@ struct unary_op_abs struct unary_op_neg { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfneg_v_f32m8(x, vl); } @@ -89,7 +89,7 @@ struct unary_op_neg struct unary_op_floor { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -99,7 +99,7 @@ struct unary_op_floor struct unary_op_ceil { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -109,7 +109,7 @@ struct unary_op_ceil struct unary_op_square { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfmul_vv_f32m8(x, x, vl); } @@ -117,7 +117,7 @@ struct unary_op_square struct unary_op_sqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsqrt_v_f32m8(x, vl); } @@ -125,7 +125,7 @@ struct unary_op_sqrt struct unary_op_rsqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -136,7 +136,7 @@ struct unary_op_rsqrt struct unary_op_exp { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -144,7 +144,7 @@ struct unary_op_exp struct unary_op_log { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -152,7 +152,7 @@ struct unary_op_log struct unary_op_sin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -160,7 +160,7 @@ struct unary_op_sin struct unary_op_cos { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + 
vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -168,7 +168,7 @@ struct unary_op_cos struct unary_op_tan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -183,7 +183,7 @@ struct unary_op_tan struct unary_op_asin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -198,7 +198,7 @@ struct unary_op_asin struct unary_op_acos { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -213,7 +213,7 @@ struct unary_op_acos struct unary_op_atan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -228,7 +228,7 @@ struct unary_op_atan struct unary_op_reciprocal { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -239,7 +239,7 @@ struct unary_op_reciprocal struct unary_op_tanh { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } @@ -338,7 +338,7 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, vl); @@ -356,7 +356,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsgnj_vf_f16m8(x, 1.f, vl); } @@ -364,7 +364,7 @@ struct unary_op_abs_fp16s struct unary_op_neg_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfneg_v_f16m8(x, vl); } @@ -372,7 +372,7 @@ struct unary_op_neg_fp16s struct unary_op_floor_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -382,7 +382,7 @@ struct unary_op_floor_fp16s struct unary_op_ceil_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -392,7 +392,7 @@ struct unary_op_ceil_fp16s struct unary_op_square_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfmul_vv_f16m8(x, x, vl); } @@ -400,7 +400,7 @@ struct unary_op_square_fp16s struct unary_op_sqrt_fp16s { - 
vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsqrt_v_f16m8(x, vl); } @@ -408,7 +408,7 @@ struct unary_op_sqrt_fp16s struct unary_op_rsqrt_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -419,7 +419,7 @@ struct unary_op_rsqrt_fp16s struct unary_op_exp_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -427,7 +427,7 @@ struct unary_op_exp_fp16s struct unary_op_log_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -435,7 +435,7 @@ struct unary_op_log_fp16s struct unary_op_sin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -443,7 +443,7 @@ struct unary_op_sin_fp16s struct unary_op_cos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -451,7 +451,7 @@ struct unary_op_cos_fp16s struct unary_op_tan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -466,7 +466,7 @@ struct unary_op_tan_fp16s struct unary_op_asin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -481,7 +481,7 @@ struct unary_op_asin_fp16s struct unary_op_acos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -496,7 +496,7 @@ struct unary_op_acos_fp16s struct unary_op_atan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -511,7 +511,7 @@ struct unary_op_atan_fp16s struct unary_op_reciprocal_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -522,7 +522,7 @@ struct unary_op_reciprocal_fp16s struct unary_op_tanh_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } diff --git a/src/mat.h b/src/mat.h index 6d7deb502a2..e534def504f 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1071,7 +1071,7 @@ NCNN_FORCEINLINE void Mat::fill(v4f32 _v) 
 NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 4;
-    const word_type vl = vsetvl_e32m1(packn);
+    const size_t vl = vsetvl_e32m1(packn);
 
     int size = (int)total();
     float* ptr = (float*)data;
@@ -1085,7 +1085,7 @@ NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
 NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 2;
-    const word_type vl = vsetvl_e16m1(packn);
+    const size_t vl = vsetvl_e16m1(packn);
 
     int size = (int)total();
     unsigned short* ptr = (unsigned short*)data;
@@ -1099,7 +1099,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
 NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 1;
-    const word_type vl = vsetvl_e8m1(packn);
+    const size_t vl = vsetvl_e8m1(packn);
 
     int size = (int)total();
     signed char* ptr = (signed char*)data;
@@ -1113,7 +1113,7 @@ NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
 NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 2;
-    const word_type vl = vsetvl_e16m1(packn);
+    const size_t vl = vsetvl_e16m1(packn);
 
     int size = (int)total();
     __fp16* ptr = (__fp16*)data;
diff --git a/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake
new file mode 100644
index 00000000000..953f21aaf95
--- /dev/null
+++ b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+if(DEFINED ENV{RISCV_ROOT_PATH})
+    file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
+else()
+    message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
+endif()
+
+set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv gnu toolchain")
+
+set(CMAKE_C_COMPILER "clang")
+set(CMAKE_CXX_COMPILER "clang++")
+set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
+
+set(CMAKE_C_COMPILER_TARGET "riscv64-unknown-linux-gnu")
+set(CMAKE_CXX_COMPILER_TARGET "riscv64-unknown-linux-gnu")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# add --ld-path=${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-ld or append $RISCV_ROOT_PATH/bin to PATH.
+set(CMAKE_C_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc")
+set(CMAKE_CXX_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc")
+
+# cache flags
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")
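
Note on the word_type -> size_t substitution that runs through the hunks above: recent snapshots of the RISC-V vector intrinsics drop the old word_type alias, and vsetvl_* simply returns size_t, so every strip-mined loop now declares its active vector length as size_t. A minimal sketch of that loop shape, using only intrinsics that already appear in this patch (vsetvl_e32m8, vle32_v_f32m8, vfmax_vf_f32m8) plus the matching vse32_v_f32m8 store; the relu_inplace name is just for illustration:

    #include <riscv_vector.h>

    // Clamp n floats at ptr to be non-negative, consuming up to vlmax elements per pass.
    // vl is whatever vsetvl grants for the remaining element count, so no scalar tail is needed.
    static void relu_inplace(float* ptr, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e32m8(n);              // active length for this pass

            vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); // load up to vl elements
            _p = vfmax_vf_f32m8(_p, 0.f, vl);         // x = max(x, 0)
            vse32_v_f32m8(ptr, _p, vl);               // store them back

            ptr += vl;
            n -= vl;
        }
    }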
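The riscv_usability.h wrappers added earlier in this diff exist because the updated intrinsics replace the multi-output vlseg/vsseg forms with tuple types (vfloat16m1x2_t/x4/x8 and friends), packed with vcreate_* and unpacked with vget_*; the wrappers preserve the old multi-register call shape at existing call sites. A rough usage sketch under the same __riscv_vector + __riscv_zfh configuration; deinterleave4_fp16 is a hypothetical helper, not part of the patch:

    #include <riscv_vector.h>
    #include "riscv_usability.h" // provides the vlseg4e16_v_f16m1 wrapper from this patch

    // Split an interleaved fp16 buffer (x0 y0 z0 w0 x1 y1 z1 w1 ...) into four planar outputs.
    static void deinterleave4_fp16(const float16_t* src, float16_t* x, float16_t* y, float16_t* z, float16_t* w, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e16m1(n);

            vfloat16m1_t _x, _y, _z, _w;
            vlseg4e16_v_f16m1(&_x, &_y, &_z, &_w, src, vl); // segmented load: one field per register

            vse16_v_f16m1(x, _x, vl);
            vse16_v_f16m1(y, _y, vl);
            vse16_v_f16m1(z, _z, vl);
            vse16_v_f16m1(w, _w, vl);

            src += vl * 4;
            x += vl;
            y += vl;
            z += vl;
            w += vl;
            n -= vl;
        }
    }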
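Several of the fp16s paths touched here (sigmoid, swish, tanh) share one pattern: values are stored as fp16 but widened to fp32 before calling the shared *_ps helpers from rvv_mathfun.h, then narrowed back on store. A condensed sketch of that round trip; it assumes the narrowing convert and fp16 store follow the same old-style naming as the widening load used in the patch (vfncvt_f_f_w_f16m4, vse16_v_f16m4), which should be checked against the toolchain's riscv_vector.h:

    #include <riscv_vector.h>
    #include "rvv_mathfun.h" // fp32 sigmoid_ps used by the fp16s layer code

    // fp16 storage, fp32 arithmetic: widen, compute, narrow back.
    static void sigmoid_fp16s(__fp16* ptr, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e16m4(n);

            // widen e16m4 -> e32m8 so the fp32 sigmoid_ps can be reused unchanged
            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
            _p = sigmoid_ps(_p, vl);

            // narrow e32m8 -> e16m4 and store
            vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

            ptr += vl;
            n -= vl;
        }
    }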