diff --git a/.github/workflows/linux-riscv64-cpu-gcc.yml b/.github/workflows/linux-riscv64-cpu-gcc.yml index 05e0487cf54..147e01f21ad 100644 --- a/.github/workflows/linux-riscv64-cpu-gcc.yml +++ b/.github/workflows/linux-riscv64-cpu-gcc.yml @@ -117,7 +117,7 @@ jobs: #id: cache-riscv #uses: actions/cache@v3 #with: - #path: rv64gcv-install + #path: rv64gcv-install-next #key: rv64gcv-linux-install-20210504 #- name: install-riscv-build-deps @@ -132,31 +132,31 @@ jobs: #with: #repository: riscv/riscv-gnu-toolchain #path: riscv-gnu-toolchain - #ref: 28271f03bb538d926ad2889dc8ad1b0cb1b3b45c + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 #- name: checkout-riscv-gnu-toolchain-submodules #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib #git submodule update --init --recursive --depth 1 riscv-binutils #git submodule update --init --recursive --depth 1 riscv-gcc - #git submodule update --init --recursive --depth 1 riscv-glibc #git submodule update --init --recursive --depth 1 riscv-dejagnu - #git submodule update --init --recursive --depth 1 riscv-newlib #git submodule update --init --recursive --depth 1 riscv-gdb #- name: riscv-gnu-toolchain #if: steps.cache-riscv.outputs.cache-hit != 'true' #run: | #cd riscv-gnu-toolchain - #sed -i '/__OBSOLETE_MATH/d' riscv-newlib/newlib/libm/common/math_errf.c - #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install --with-arch=rv64gcv_zfh + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh #make linux #- name: riscv-strip-install #if: steps.cache-riscv.outputs.cache-hit != 'true' - #run: find $GITHUB_WORKSPACE/rv64gcv-install -type f | xargs -i strip -g {} || true + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true - name: configure - run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + run: export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next && mkdir build && cd build && cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. 
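The configure step above cross-compiles with toolchains/riscv64-unknown-linux-gnu.toolchain.cmake against the rv64gcv-install-next prefix, and the test steps further on run the resulting binaries under qemu-riscv64 with v=true, Zfh=true and vlen=256/128. As a quick sanity check of such a toolchain/qemu pair, here is a minimal standalone sketch written in the same intrinsics style the patch converts to (size_t for the vector length instead of the removed word_type, unprefixed intrinsic names as used throughout the hunks below; newer toolchains spell these with a __riscv_ prefix). The file name, the riscv64-unknown-linux-gnu-g++ driver name and the <riscv_vector.h> header are assumptions for illustration, not taken from the workflow itself.

```cpp
// smoke.cpp - minimal RVV strip-mining loop in the patch's intrinsic style.
// Assumed build/run (hypothetical paths):
//   riscv64-unknown-linux-gnu-g++ -march=rv64gcv smoke.cpp -o smoke
//   qemu-riscv64 -cpu rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0 -L <sysroot> ./smoke
#include <riscv_vector.h>
#include <cstdio>

int main()
{
    float in[7] = {-1.f, 0.f, 1.f, 2.f, 3.f, 4.f, 5.f};
    float out[7] = {0.f};

    const float* ptr = in;
    float* outptr = out;
    int n = 7; // deliberately not a power of two so the tail gets a shorter vl

    while (n > 0)
    {
        size_t vl = vsetvl_e32m8(n);              // size_t, not the removed word_type
        vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); // load up to vl elements
        _p = vfadd_vf_f32m8(_p, 1.f, vl);         // vector + scalar
        vse32_v_f32m8(outptr, _p, vl);            // store
        ptr += vl;
        outptr += vl;
        n -= vl;
    }

    printf("%f %f\n", out[0], out[6]); // expect 0.000000 6.000000
    return 0;
}
```

If this builds with -march=rv64gcv and runs under the same qemu invocation as the test steps, the toolchain and emulator are set up consistently with what the ctest jobs expect.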
- name: build run: cmake --build build -j 4 @@ -164,10 +164,10 @@ jobs: run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 - name: test-vlen128 run: | export PATH=/data/action/osd/qemu-install/bin:$PATH cd build - TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install/sysroot" ctest --output-on-failure -j 4 + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/.github/workflows/linux-riscv64-cpu-gnu-clang.yml b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml new file mode 100644 index 00000000000..18ad114efa4 --- /dev/null +++ b/.github/workflows/linux-riscv64-cpu-gnu-clang.yml @@ -0,0 +1,142 @@ +name: linux-riscv64-cpu-gnu-clang +on: + push: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' + pull_request: + branches: [master] + paths: + - '.github/workflows/linux-riscv64-cpu-gnu-clang.yml' + - 'toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake' + - 'CMakeLists.txt' + - 'cmake/**' + - 'src/*' + - 'src/layer/*' + - 'src/layer/riscv/**' + - 'tests/**' +concurrency: + group: linux-riscv64-cpu-gnu-clang-${{ github.ref }} + cancel-in-progress: true +permissions: + contents: read + +jobs: + linux-gcc-riscv64-rvv: + runs-on: [self-hosted, linux, centos] + steps: + - uses: actions/checkout@v3 + + #- name: cache-qemu + #id: cache-qemu + #uses: actions/cache@v3 + #with: + #path: qemu-install + #key: qemu-riscv64-install-20220502-3 + #- name: install-qemu-build-deps + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev ninja-build + #- name: checkout-qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: qemu/qemu + #path: qemu + #ref: f5643914a9e8f79c606a76e6a9d7ea82a3fc3e65 + #- name: qemu + #if: steps.cache-qemu.outputs.cache-hit != 'true' + #run: | + #cd qemu + #wget https://raw.githubusercontent.com/nihui/ncnn-assets/master/qemu-patches/0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #patch -p1 -i 0007-linux-user-Expose-risc-v-V-isa-bit-in-get_elf_hwcap.patch + #./configure --prefix=$GITHUB_WORKSPACE/qemu-install --target-list=riscv64-linux-user --disable-system + #make -j2 + #make install + + #- name: cache-riscv + #id: cache-riscv + #uses: actions/cache@v3 + #with: + #path: rv64gcv-install-next + #key: rv64gcv-linux-install-20210504 + + #- name: install-riscv-build-deps + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #sudo apt-get update + #sudo apt-get install autoconf automake autotools-dev curl python3 libmpc-dev libmpfr-dev libgmp-dev gawk build-essential bison flex texinfo gperf libtool 
patchutils bc zlib1g-dev libexpat-dev device-tree-compiler + + #- name: checkout-riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #uses: actions/checkout@v3 + #with: + #repository: riscv/riscv-gnu-toolchain + #path: riscv-gnu-toolchain + #ref: da01ba455ce3802ffa84fdca3a089079996dbfc3 + #- name: checkout-riscv-gnu-toolchain-submodules + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #git submodule update --init --recursive --depth 1 glibc + #git submodule update --init --recursive --depth 1 newlib + #git submodule update --init --recursive --depth 1 riscv-binutils + #git submodule update --init --recursive --depth 1 riscv-gcc + #git submodule update --init --recursive --depth 1 riscv-dejagnu + #git submodule update --init --recursive --depth 1 riscv-gdb + #- name: riscv-gnu-toolchain + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: | + #cd riscv-gnu-toolchain + #sed -i '/__OBSOLETE_MATH/d' newlib/newlib/libm/common/math_errf.c + #./configure --prefix=$GITHUB_WORKSPACE/rv64gcv-install-next --with-arch=rv64gcv_zfh + #make linux + + #- name: riscv-strip-install + #if: steps.cache-riscv.outputs.cache-hit != 'true' + #run: find $GITHUB_WORKSPACE/rv64gcv-install-next -type f | xargs -i strip -g {} || true + + # - name: install-clang + # run: | + # wget https://github.com/llvm/llvm-project/releases/download/llvmorg-15.0.1/llvm-project-15.0.1.src.tar.xz + # tar -xf llvm-project-15.0.1.src.tar.xz + # cd llvm-project-15.0.1.src + # mkdir build + # cd build + # cmake -DCMAKE_INSTALL_PREFIX=install -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=ON -DLLVM_ENABLE_PROJECTS="clang" -DLLVM_TARGETS_TO_BUILD="RISCV" -DLLVM_INCLUDE_EXAMPLES=OFF -DLLVM_INCLUDE_TESTS=OFF ../llvm/ + # make -j16 + # make install + + - name: build + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export RISCV_ROOT_PATH=/data/action/osd/rv64gcv-install-next + export PATH=/data/action/osd/llvm-project-15.0.1.src/build/install/bin:$PATH + mkdir build && cd build + cmake -DCMAKE_TOOLCHAIN_FILE=../toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON .. + cmake --build . 
-j 4 + + - name: test-vlen256 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=256,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 + + - name: test-vlen128 + env: + LD_LIBRARY_PATH: /data/action/install/lib64 + run: | + export PATH=/data/action/osd/qemu-install/bin:$PATH + cd build + TESTS_EXECUTABLE_LOADER=qemu-riscv64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-cpu;rv64,v=true,Zfh=true,vlen=128,elen=64,vext_spec=v1.0;-L;/data/action/osd/rv64gcv-install-next/sysroot" ctest --output-on-failure -j 4 diff --git a/CMakeLists.txt b/CMakeLists.txt index c453d23e544..4ece7ada739 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -296,10 +296,15 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") include(CheckCXXCompilerFlag) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") - check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; float _v; word_type vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) + check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; float _v; size_t vl; _s = vfmacc_vf_f32m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV) set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh") - check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; word_type vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_FP16) + check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZFH) + + if(NOT NCNN_COMPILER_SUPPORT_RVV_ZFH) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") + check_cxx_source_compiles("#include \nint main() { vfloat16m1_t _s, _w; __fp16 _v; size_t vl; _s = vfmacc_vf_f16m1(_s, _v, _w, vl); return 0; }" NCNN_COMPILER_SUPPORT_RVV_ZVFH) + endif() unset(CMAKE_REQUIRED_FLAGS) @@ -309,9 +314,19 @@ elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv)") if(NCNN_RVV_CHECK_VFREDSUM) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/ncnn_check_rvv_vfredusum.cmake) endif() - if(NOT NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NOT (NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH)) message(WARNING "The compiler does not support risc-v zfh extension. Upgrading your toolchain is strongly recommended.") endif() + option(NCNN_RVV_CHECK_PLAIN_SEGMENT "check compilter about rvv segment load/store interface" ON) + if(NCNN_RVV_CHECK_PLAIN_SEGMENT) + set(CMAKE_REQUIRED_FLAGS "-march=rv64gcv") + check_cxx_source_compiles("#include \nint main() { vfloat32m1_t _s, _w; size_t vl; float src[32]={.0f}; vlseg2e32_v_f32m1(&_s, &_w, src, vl); return 0; }" NCNN_COMPILER_USE_RVV_PLAIN_SEG) + unset(CMAKE_REQUIRED_FLAGS) + endif() + if(NOT NCNN_COMPILER_USE_RVV_PLAIN_SEG) + message(WARNING "The compiler uses tuple types for segment load/store. Upgrading your toolchain is strongly recommended.") + add_definitions(-D__rvv_tuple) + endif() else() message(WARNING "The compiler does not support risc-v v extension. 
NCNN_RVV will be OFF.") endif() diff --git a/cmake/ncnn_add_layer.cmake b/cmake/ncnn_add_layer.cmake index 89d61823deb..8abb13331a9 100644 --- a/cmake/ncnn_add_layer.cmake +++ b/cmake/ncnn_add_layer.cmake @@ -271,8 +271,10 @@ macro(ncnn_add_layer class) endif() if(NCNN_RUNTIME_CPU AND NCNN_RVV AND NCNN_TARGET_ARCH STREQUAL "riscv") - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh") + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16") elseif(NCNN_COMPILER_SUPPORT_RVV) ncnn_add_arch_opt_layer(${class} rvv "-march=rv64gcv") endif() diff --git a/cmake/ncnn_check_rvv_vfredusum.cmake b/cmake/ncnn_check_rvv_vfredusum.cmake index 81496a765d1..59065556356 100644 --- a/cmake/ncnn_check_rvv_vfredusum.cmake +++ b/cmake/ncnn_check_rvv_vfredusum.cmake @@ -9,7 +9,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredsum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -23,7 +23,7 @@ int main(void) { float in1[4] = {-1.f,0.f,+1.f,2.f}; float out1=0; - word_type vl = vsetvl_e32m8(4); + size_t vl = vsetvl_e32m8(4); vfloat32m8_t _add = vle32_v_f32m8(in1,vl); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(),out1,vl); _sum = vfredusum_vs_f32m8_f32m1(_sum, _add, _sum, vl); @@ -36,7 +36,7 @@ if(NCNN_COMPILER_USE_VFREDSUM AND NOT NCNN_COMPILER_USE_VFREDUSUM) message(WARNING "The compiler uses vfredsum. Upgrading your toolchain is strongly recommended.") foreach(LMUL 1 2 4 8) add_definitions(-Dvfredusum_vs_f32m${LMUL}_f32m1=vfredsum_vs_f32m${LMUL}_f32m1) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH OR NCNN_COMPILER_SUPPORT_RVV_ZVFH) add_definitions(-Dvfredusum_vs_f16m${LMUL}_f16m1=vfredsum_vs_f16m${LMUL}_f16m1) endif() endforeach() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 3a851554c34..11b8573462a 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -459,8 +459,10 @@ endif() if(NCNN_TARGET_ARCH STREQUAL "riscv" AND NOT C906) if(NOT NCNN_RUNTIME_CPU AND NCNN_RVV) - if(NCNN_COMPILER_SUPPORT_RVV_FP16) + if(NCNN_COMPILER_SUPPORT_RVV_ZFH) target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh) + elseif(NCNN_COMPILER_SUPPORT_RVV_ZVFH) + target_compile_options(ncnn PRIVATE -march=rv64gcv_zfh_zvfh0p1 -menable-experimental-extensions -D__fp16=_Float16) elseif(NCNN_COMPILER_SUPPORT_RVV) target_compile_options(ncnn PRIVATE -march=rv64gcv) endif() diff --git a/src/layer/riscv/absval_riscv.cpp b/src/layer/riscv/absval_riscv.cpp index 4a41788ec9e..092a8b5d6b1 100644 --- a/src/layer/riscv/absval_riscv.cpp +++ b/src/layer/riscv/absval_riscv.cpp @@ -66,7 +66,7 @@ int AbsVal_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfabs_v_f32m8_absval(_p, vl); @@ -106,7 +106,7 @@ int AbsVal_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfabs_v_f16m8_absval(_p, vl); diff --git a/src/layer/riscv/binaryop_riscv.cpp b/src/layer/riscv/binaryop_riscv.cpp index b4e53a2c856..9858e654822 100644 
--- a/src/layer/riscv/binaryop_riscv.cpp +++ b/src/layer/riscv/binaryop_riscv.cpp @@ -67,7 +67,7 @@ static int binary_op_2_3_4_20(const Mat& a, const Mat& b, Mat& c, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(a0, _p, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -108,7 +108,7 @@ static int binary_op_6_11_16_25(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, b0, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -149,7 +149,7 @@ static int binary_op_7_13_19_29(const Mat& a, const Mat& b, Mat& c, const Option int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -217,7 +217,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -252,7 +252,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -289,7 +289,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -328,7 +328,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -367,7 +367,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -400,7 +400,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, *ptr1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -436,7 +436,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -469,7 +469,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _p = vfmv_v_f_f32m8(*ptr, 
vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -508,7 +508,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -545,7 +545,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr1_vol = ptr1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1_vol, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -583,7 +583,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -620,7 +620,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) const float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_p, _p1, vl); @@ -662,7 +662,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -699,7 +699,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -736,7 +736,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -774,7 +774,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -825,7 +825,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _outp = op(_p, _b0x, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -867,7 +867,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -899,7 +899,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); 
vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -931,7 +931,7 @@ static int binary_op_rvv(const Mat& a, const Mat& b, Mat& c, const Option& opt) int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p1 = vle32_v_f32m8(ptr1, vl); vfloat32m8_t _outp = op(_a0x, _p1, vl); vse32_v_f32m8(outptr, _outp, vl); @@ -985,7 +985,7 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, b, vl); vse32_v_f32m8(ptr, _p, vl); @@ -1000,21 +1000,21 @@ static int binary_op_scalar_rvv(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define MAKE_FUNCTION(NAME, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat32m8_t operator()(const vfloat32m8_t& x, const float y, const size_t vl) const \ + { \ + return IMPLVS; \ + } \ + vfloat32m8_t operator()(const float x, const vfloat32m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; MAKE_FUNCTION(binary_op_add_rvv, vfadd_vv_f32m8(x, y, vl), vfadd_vf_f32m8(x, y, vl), vfadd_vf_f32m8(y, x, vl)) @@ -1159,7 +1159,7 @@ static int binary_op_2_3_4_20_fp16s(const Mat& a, const Mat& b, Mat& c, const Op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(a0, _p, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1200,7 +1200,7 @@ static int binary_op_6_11_16_25_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, b0, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1241,7 +1241,7 @@ static int binary_op_7_13_19_29_fp16s(const Mat& a, const Mat& b, Mat& c, const int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1309,7 +1309,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1344,7 +1344,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * h * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1381,7 +1381,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - 
word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1420,7 +1420,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1459,7 +1459,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1492,7 +1492,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, *ptr1, vl); @@ -1527,7 +1527,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1560,7 +1560,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _p = vfmv_v_f_f16m8(*ptr, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1598,7 +1598,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1637,7 +1637,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr1_vol = ptr1 + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1_vol, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1676,7 +1676,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1715,7 +1715,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& const __fp16* ptr_vol = ptr + x * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr_vol, vl); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_p, _p1, vl); @@ -1758,7 +1758,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1795,7 +1795,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const 
Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1832,7 +1832,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * h1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1870,7 +1870,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1921,7 +1921,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vfloat16m8_t _outp = op(_p, _b0x, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1963,7 +1963,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -1995,7 +1995,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n1 = size1 * elempack1; while (n1 > 0) { - word_type vl = vsetvl_e16m8(n1); + size_t vl = vsetvl_e16m8(n1); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2027,7 +2027,7 @@ static int binary_op_rvv_fp16s(const Mat& a, const Mat& b, Mat& c, const Option& int n = w1 * elempack1; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p1 = vle16_v_f16m8(ptr1, vl); vfloat16m8_t _outp = op(_a0x, _p1, vl); vse16_v_f16m8(outptr, _outp, vl); @@ -2706,7 +2706,7 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, b, vl); vse16_v_f16m8(ptr, _p, vl); @@ -2721,25 +2721,25 @@ static int binary_op_scalar_rvv_fp16s(Mat& a, float b, const Option& opt) namespace BinaryOp_riscv_functor { -#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ - struct NAME \ - { \ - __fp16 operator()(const __fp16& x, const __fp16& y) const \ - { \ - return IMPL; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLVV; \ - } \ - vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const word_type vl) const \ - { \ - return IMPLVS; \ - } \ - vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const word_type vl) const \ - { \ - return IMPLSV; \ - } \ +#define MAKE_FUNCTION(NAME, IMPL, IMPLVV, IMPLVS, IMPLSV) \ + struct NAME \ + { \ + __fp16 operator()(const __fp16& x, const __fp16& y) const \ + { \ + return IMPL; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLVV; \ + } \ + vfloat16m8_t operator()(const vfloat16m8_t& x, const float y, const size_t vl) const \ 
+ { \ + return IMPLVS; \ + } \ + vfloat16m8_t operator()(const float x, const vfloat16m8_t& y, const size_t vl) const \ + { \ + return IMPLSV; \ + } \ }; // clang-format off diff --git a/src/layer/riscv/cast_riscv.cpp b/src/layer/riscv/cast_riscv.cpp index 8ea5d0f05ef..5d0642e7da7 100644 --- a/src/layer/riscv/cast_riscv.cpp +++ b/src/layer/riscv/cast_riscv.cpp @@ -101,7 +101,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat16m4_t _outp = vfncvt_f_f_w_f16m4(_p, vl); @@ -125,7 +125,7 @@ int Cast_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat32m8_t _outp = vfwcvt_f_f_v_f32m8(_p, vl); diff --git a/src/layer/riscv/clip_riscv.cpp b/src/layer/riscv/clip_riscv.cpp index 9acff0218f0..8c43e06a4d8 100644 --- a/src/layer/riscv/clip_riscv.cpp +++ b/src/layer/riscv/clip_riscv.cpp @@ -62,7 +62,7 @@ int Clip_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -107,7 +107,7 @@ int Clip_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmax_vf_f32m8(_p, min, vl); @@ -139,7 +139,7 @@ int Clip_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmax_vf_f16m8(_p, min, vl); diff --git a/src/layer/riscv/concat_riscv.cpp b/src/layer/riscv/concat_riscv.cpp index d80d9985b47..5736fd25dcd 100644 --- a/src/layer/riscv/concat_riscv.cpp +++ b/src/layer/riscv/concat_riscv.cpp @@ -143,7 +143,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -266,7 +266,7 @@ int Concat_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int size = bottom_blob.w * bottom_blob.h; @@ -487,7 +487,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); for (int i = 0; i < bottom_blob.h; i++) { @@ -610,7 +610,7 @@ int Concat_riscv::forward_bf16s_fp16s(const std::vector& bottom_blobs, std: #if __riscv_vector if (bottom_blob.elempack == packn && elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int size = bottom_blob.w * bottom_blob.h; diff --git a/src/layer/riscv/convolution1d_riscv.cpp b/src/layer/riscv/convolution1d_riscv.cpp index a956d394f17..483aa511672 100644 --- a/src/layer/riscv/convolution1d_riscv.cpp +++ 
b/src/layer/riscv/convolution1d_riscv.cpp @@ -119,7 +119,7 @@ int Convolution1D_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Op #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -476,7 +476,7 @@ int Convolution1D_riscv::create_pipeline_fp16s(const Option& opt) int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -697,7 +697,7 @@ int Convolution1D_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, co int Convolution1D_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/convolution_1x1_packn.h b/src/layer/riscv/convolution_1x1_packn.h index 8f55d260abc..31bf72ba3d0 100644 --- a/src/layer/riscv/convolution_1x1_packn.h +++ b/src/layer/riscv/convolution_1x1_packn.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, con static void conv1x1s2_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packn_fp16s.h b/src/layer/riscv/convolution_1x1_packn_fp16s.h index 110d61dc121..5ac3f8967ce 100644 --- a/src/layer/riscv/convolution_1x1_packn_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packn_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_bl static void conv1x1s2_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1.h b/src/layer/riscv/convolution_1x1_packnto1.h index 0cd1747586e..a3e1204a325 100644 --- a/src/layer/riscv/convolution_1x1_packnto1.h +++ b/src/layer/riscv/convolution_1x1_packnto1.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv1x1s2_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h index 04e86f97dca..10591ab27f2 100644 --- a/src/layer/riscv/convolution_1x1_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_1x1_packnto1_fp16s.h @@ -28,7 +28,7 @@ static void conv1x1s1_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top static void conv1x1s2_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = 
vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton.h b/src/layer/riscv/convolution_3x3_pack1ton.h index bb123ef8997..9adcfb1e263 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton.h +++ b/src/layer/riscv/convolution_3x3_pack1ton.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const static void conv3x3s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h index e25c7d09097..bff24a0099f 100644 --- a/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_3x3_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int inch = bottom_blob.c; int outw = top_blob.w; @@ -290,7 +290,7 @@ static void conv3x3s1_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void conv3x3s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton.h b/src/layer/riscv/convolution_7x7_pack1ton.h index 06c4dfe2f6a..3605ed027cd 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton.h +++ b/src/layer/riscv/convolution_7x7_pack1ton.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h index 91ee1b7d826..01804bf391d 100644 --- a/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_7x7_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void conv7x7s2_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton.h b/src/layer/riscv/convolution_pack1ton.h index f667f4d5d09..15eec7badd9 100644 --- a/src/layer/riscv/convolution_pack1ton.h +++ b/src/layer/riscv/convolution_pack1ton.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& 
weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_pack1ton_fp16s.h b/src/layer/riscv/convolution_pack1ton_fp16s.h index fc486173031..6f8c649e632 100644 --- a/src/layer/riscv/convolution_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -95,7 +95,7 @@ static void convolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn.h b/src/layer/riscv/convolution_packn.h index c9b51d07881..9d18c1d858e 100644 --- a/src/layer/riscv/convolution_packn.h +++ b/src/layer/riscv/convolution_packn.h @@ -15,7 +15,7 @@ static void convolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packn_fp16s.h b/src/layer/riscv/convolution_packn_fp16s.h index 8ae4468495a..1f7b308e846 100644 --- a/src/layer/riscv/convolution_packn_fp16s.h +++ b/src/layer/riscv/convolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -100,7 +100,7 @@ static void convolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, c static void convolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = 
vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1.h b/src/layer/riscv/convolution_packnto1.h index 7eda3858083..4c66116d20e 100644 --- a/src/layer/riscv/convolution_packnto1.h +++ b/src/layer/riscv/convolution_packnto1.h @@ -15,7 +15,7 @@ static void convolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_packnto1_fp16s.h b/src/layer/riscv/convolution_packnto1_fp16s.h index 63aefbb5d5a..83efd3081f8 100644 --- a/src/layer/riscv/convolution_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; @@ -109,7 +109,7 @@ static void convolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob static void convolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int channels = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm.h b/src/layer/riscv/convolution_sgemm.h index c62db6c78ee..801b7cc456f 100644 --- a/src/layer/riscv/convolution_sgemm.h +++ b/src/layer/riscv/convolution_sgemm.h @@ -16,7 +16,7 @@ static void im2col_sgemm_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_fp16s.h b/src/layer/riscv/convolution_sgemm_fp16s.h index 5cd5ea8a31e..72a621641db 100644 --- a/src/layer/riscv/convolution_sgemm_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_fp16s.h @@ -16,7 +16,7 @@ static void im2col_sgemm_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, con { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton.h b/src/layer/riscv/convolution_sgemm_pack1ton.h index bc2f558a6d9..8a3e6ffbc43 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_rvv(const Mat& bottom_im2col, Mat& top_blob, const 
Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h index c3590a6ed6b..0c0b2791a8f 100644 --- a/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void im2col_sgemm_pack1ton_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u, 1, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_sgemm_packn.h b/src/layer/riscv/convolution_sgemm_packn.h index 88518a23136..9255c092ae4 100644 --- a/src/layer/riscv/convolution_sgemm_packn.h +++ b/src/layer/riscv/convolution_sgemm_packn.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -78,7 +78,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -119,7 +119,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -156,7 +156,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -363,7 +363,7 @@ static void im2col_sgemm_packn_rvv(const Mat& bottom_im2col, Mat& top_blob, cons static void convolution_im2col_sgemm_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packn_fp16s.h b/src/layer/riscv/convolution_sgemm_packn_fp16s.h index 977dc38204a..cb3b65196ed 100644 --- a/src/layer/riscv/convolution_sgemm_packn_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packn_fp16s.h @@ 
-15,7 +15,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -109,7 +109,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -172,7 +172,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -228,7 +228,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -435,7 +435,7 @@ static void im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blo static void convolution_im2col_sgemm_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1.h b/src/layer/riscv/convolution_sgemm_packnto1.h index 212cf98b39b..2df2c7d7656 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1.h +++ b/src/layer/riscv/convolution_sgemm_packnto1.h @@ -15,7 +15,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_im2col(size, maxk, inch, 4u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val5 = vle32_v_f32m1(img0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(img0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(img0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(img0 + packn * 2, vl); vfloat32m1_t _val3 
= vle32_v_f32m1(img0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c #else vfloat32m1_t _val0 = vle32_v_f32m1(img0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(img0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; +#ifdef __clang__ + // clang complains about VLA in the following loop + float* _zero_tmp = new float[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c float* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const float* zeros = _zero_tmp; +#else const float zeros[packn] = {0.f}; +#endif // __clang__ const float* biasptr = bias ? bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 6, top_blob.cstep * sizeof(float), _sum6, vl); vsse32_v_f32m1(outptr0 + 7, top_blob.cstep * sizeof(float), _sum7, vl); #else - vssseg8e32_v_f32m1x8(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0 + 2, top_blob.cstep * sizeof(float), _sum2, vl); vsse32_v_f32m1(outptr0 + 3, top_blob.cstep * sizeof(float), _sum3, vl); #else - vssseg4e32_v_f32m1x4(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c vsse32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, vl); vsse32_v_f32m1(outptr0 + 1, top_blob.cstep * sizeof(float), _sum1, vl); #else - vssseg2e32_v_f32m1x2(outptr0, top_blob.cstep * sizeof(float), vcreate_f32m1x2(_sum0, _sum1), vl); + vssseg2e32_v_f32m1(outptr0, top_blob.cstep * sizeof(float), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x8_t _val01 = vlseg8e32_v_f32m1x8(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vfloat32m1_t _val4; + vfloat32m1_t _val5; + vfloat32m1_t _val6; + vfloat32m1_t _val7; + 
vlseg8e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x8_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x8_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x8_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x8_f32m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f32m1(_sum4, vget_f32m1x8_f32m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f32m1(_sum5, vget_f32m1x8_f32m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f32m1(_sum6, vget_f32m1x8_f32m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f32m1(_sum7, vget_f32m1x8_f32m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f32m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f32m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f32m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f32m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,16 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x4_t _val01 = vlseg4e32_v_f32m1x4(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vfloat32m1_t _val2; + vfloat32m1_t _val3; + vlseg4e32_v_f32m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x4_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x4_f32m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f32m1(_sum2, vget_f32m1x4_f32m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f32m1(_sum3, vget_f32m1x4_f32m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f32m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f32m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +546,12 @@ static void im2col_sgemm_packnto1_rvv(const Mat& bottom_im2col, Mat& top_blob, c for (int j = 0; j < nn; j++) { - vfloat32m1x2_t _val01 = vlseg2e32_v_f32m1x2(tmpptr, vl); + vfloat32m1_t _val0; + vfloat32m1_t _val1; + vlseg2e32_v_f32m1(&_val0, &_val1, tmpptr, vl); vfloat32m1_t _w0 = vle32_v_f32m1(kptr0, vl); - _sum0 = vfmacc_vv_f32m1(_sum0, vget_f32m1x2_f32m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f32m1(_sum1, vget_f32m1x2_f32m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f32m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f32m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +677,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_rvv(const Mat& _k static void convolution_im2col_sgemm_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h index d6dd867397c..925713d9826 100644 --- a/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h +++ b/src/layer/riscv/convolution_sgemm_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void 
im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_im2col(size, maxk, inch, 2u * packn, packn, opt.workspace_allocator); @@ -77,7 +77,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val5 = vle16_v_f16m1(img0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(img0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(img0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); img0 += size * packn; tmpptr += packn * 8; @@ -118,7 +118,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(img0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(img0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); img0 += size * packn; tmpptr += packn * 4; @@ -155,7 +155,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ #else vfloat16m1_t _val0 = vle16_v_f16m1(img0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(img0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); img0 += size * packn; tmpptr += packn * 2; @@ -190,6 +190,14 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ int nn_outch = outch / packn; int remain_outch_start = nn_outch * packn; + // make clang happy with the following loop +#ifdef __clang__ + __fp16* _zero_tmp = new __fp16[packn](); + for (int _zero_clean_idx = 0; _zero_clean_idx < packn; _zero_clean_idx++) + { + _zero_tmp[_zero_clean_idx] = 0.f; + } +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int pp = 0; pp < nn_outch; pp++) { @@ -197,7 +205,11 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ __fp16* outptr0 = top_blob.channel(p); +#ifdef __clang__ + const __fp16* zeros = _zero_tmp; +#else const __fp16 zeros[packn] = {0.f}; +#endif // __clang__ const __fp16* biasptr = bias ? 
bias + p : zeros; int i = 0; @@ -250,7 +262,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 6, top_blob.cstep * sizeof(__fp16), _sum6, vl); vsse16_v_f16m1(outptr0 + 7, top_blob.cstep * sizeof(__fp16), _sum7, vl); #else - vssseg8e16_v_f16m1x8(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x8(_sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7), vl); + vssseg8e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, _sum4, _sum5, _sum6, _sum7, vl); #endif outptr0 += 8; } @@ -287,7 +299,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0 + 2, top_blob.cstep * sizeof(__fp16), _sum2, vl); vsse16_v_f16m1(outptr0 + 3, top_blob.cstep * sizeof(__fp16), _sum3, vl); #else - vssseg4e16_v_f16m1x4(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x4(_sum0, _sum1, _sum2, _sum3), vl); + vssseg4e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, _sum2, _sum3, vl); #endif outptr0 += 4; } @@ -316,7 +328,7 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ vsse16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, vl); vsse16_v_f16m1(outptr0 + 1, top_blob.cstep * sizeof(__fp16), _sum1, vl); #else - vssseg2e16_v_f16m1x2(outptr0, top_blob.cstep * sizeof(__fp16), vcreate_f16m1x2(_sum0, _sum1), vl); + vssseg2e16_v_f16m1(outptr0, top_blob.cstep * sizeof(__fp16), _sum0, _sum1, vl); #endif outptr0 += 2; } @@ -343,6 +355,9 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ outptr0 += 1; } } +#ifdef __clang__ + delete[] _zero_tmp; +#endif // __clang__ #pragma omp parallel for num_threads(opt.num_threads) for (int p = remain_outch_start; p < outch; p++) @@ -379,16 +394,24 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x8_t _val01 = vlseg8e16_v_f16m1x8(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + vfloat16m1_t _val4; + vfloat16m1_t _val5; + vfloat16m1_t _val6; + vfloat16m1_t _val7; + vlseg8e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, &_val4, &_val5, &_val6, &_val7, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x8_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x8_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x8_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x8_f16m1(_val01, 3), _w0, vl); - _sum4 = vfmacc_vv_f16m1(_sum4, vget_f16m1x8_f16m1(_val01, 4), _w0, vl); - _sum5 = vfmacc_vv_f16m1(_sum5, vget_f16m1x8_f16m1(_val01, 5), _w0, vl); - _sum6 = vfmacc_vv_f16m1(_sum6, vget_f16m1x8_f16m1(_val01, 6), _w0, vl); - _sum7 = vfmacc_vv_f16m1(_sum7, vget_f16m1x8_f16m1(_val01, 7), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); + _sum4 = vfmacc_vv_f16m1(_sum4, _val4, _w0, vl); + _sum5 = vfmacc_vv_f16m1(_sum5, _val5, _w0, vl); + _sum6 = vfmacc_vv_f16m1(_sum6, _val6, _w0, vl); + _sum7 = vfmacc_vv_f16m1(_sum7, _val7, _w0, vl); tmpptr += packn * 8; kptr0 += packn; } @@ -463,12 +486,17 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x4_t _val01 = vlseg4e16_v_f16m1x4(tmpptr, vl); + vfloat16m1_t _val0; 
+ vfloat16m1_t _val1; + vfloat16m1_t _val2; + vfloat16m1_t _val3; + + vlseg4e16_v_f16m1(&_val0, &_val1, &_val2, &_val3, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x4_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x4_f16m1(_val01, 1), _w0, vl); - _sum2 = vfmacc_vv_f16m1(_sum2, vget_f16m1x4_f16m1(_val01, 2), _w0, vl); - _sum3 = vfmacc_vv_f16m1(_sum3, vget_f16m1x4_f16m1(_val01, 3), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); + _sum2 = vfmacc_vv_f16m1(_sum2, _val2, _w0, vl); + _sum3 = vfmacc_vv_f16m1(_sum3, _val3, _w0, vl); tmpptr += packn * 4; kptr0 += packn; } @@ -519,10 +547,12 @@ static void im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_im2col, Mat& top_ for (int j = 0; j < nn; j++) { - vfloat16m1x2_t _val01 = vlseg2e16_v_f16m1x2(tmpptr, vl); + vfloat16m1_t _val0; + vfloat16m1_t _val1; + vlseg2e16_v_f16m1(&_val0, &_val1, tmpptr, vl); vfloat16m1_t _w0 = vle16_v_f16m1(kptr0, vl); - _sum0 = vfmacc_vv_f16m1(_sum0, vget_f16m1x2_f16m1(_val01, 0), _w0, vl); - _sum1 = vfmacc_vv_f16m1(_sum1, vget_f16m1x2_f16m1(_val01, 1), _w0, vl); + _sum0 = vfmacc_vv_f16m1(_sum0, _val0, _w0, vl); + _sum1 = vfmacc_vv_f16m1(_sum1, _val1, _w0, vl); tmpptr += packn * 2; kptr0 += packn; } @@ -648,7 +678,7 @@ static void convolution_im2col_sgemm_transform_kernel_packnto1_fp16sa_rvv(const static void convolution_im2col_sgemm_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int inch = bottom_blob.c; diff --git a/src/layer/riscv/convolution_winograd_dot.h b/src/layer/riscv/convolution_winograd_dot.h index 8ea6bc9c576..c0a7b7680f8 100644 --- a/src/layer/riscv/convolution_winograd_dot.h +++ b/src/layer/riscv/convolution_winograd_dot.h @@ -16,7 +16,7 @@ static void convolution_winograd_dot_rvv(Mat& bottom_blob_tm, int outch, const M { #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u, opt.workspace_allocator); diff --git a/src/layer/riscv/convolution_winograd_dot_packn.h b/src/layer/riscv/convolution_winograd_dot_packn.h index 434eaa00c68..1c505d5c2e1 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn.h +++ b/src/layer/riscv/convolution_winograd_dot_packn.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 4u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val5 = vle32_v_f32m1(r0 + packn * 5, vl); vfloat32m1_t _val6 = vle32_v_f32m1(r0 + packn * 6, vl); vfloat32m1_t _val7 = vle32_v_f32m1(r0 + packn * 7, vl); - vsseg8e32_v_f32m1x8(tmpptr, vcreate_f32m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; 
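The hunks in these RISC-V source files all apply the same two mechanical substitutions: the active vector length returned by vsetvl_* is now held in a plain size_t rather than the old word_type alias, and the tuple-type segment load/store intrinsics (vsseg4e32_v_f32m1x4 with vcreate_f32m1x4, vlseg4e32_v_f32m1x4 with vget_f32m1x4_f32m1, and their e16 / x2 / x8 siblings) are replaced by flat forms that take or return the individual vector registers. For reference, a minimal stand-alone sketch of the before/after usage follows; it is illustrative only, not part of the patch, and assumes a toolchain providing the non-tuple RVV intrinsics used above:

    #include <riscv_vector.h>

    // Interleave four packn-wide rows into dst as r0[i], r1[i], r2[i], r3[i], ...
    // then de-interleave them again; mirrors the tmpptr packing done in the hunks.
    static void interleave4_f32(float* dst, const float* r0, const float* r1,
                                const float* r2, const float* r3, int packn)
    {
        const size_t vl = vsetvl_e32m1(packn); // previously declared as word_type

        vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl);
        vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl);
        vfloat32m1_t _p2 = vle32_v_f32m1(r2, vl);
        vfloat32m1_t _p3 = vle32_v_f32m1(r3, vl);

        // old tuple-type form removed by this patch:
        //   vsseg4e32_v_f32m1x4(dst, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl);
        // new form passes the segment registers directly:
        vsseg4e32_v_f32m1(dst, _p0, _p1, _p2, _p3, vl);

        // the matching de-interleaving load follows the same pattern,
        // returning each segment through an output pointer:
        vfloat32m1_t _q0;
        vfloat32m1_t _q1;
        vfloat32m1_t _q2;
        vfloat32m1_t _q3;
        vlseg4e32_v_f32m1(&_q0, &_q1, &_q2, &_q3, dst, vl);
        (void)_q0; (void)_q1; (void)_q2; (void)_q3; // silence unused warnings in this sketch
    }

The strided variants (vssseg8e32_v_f32m1 and friends) and the f16 versions in these files follow the same renaming, so the numerical behaviour of the kernels is unchanged.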
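One change in the packnto1 sgemm hunks is not a pure rename: the per-channel bias fallback const float zeros[packn] = {0.f}; (and its __fp16 counterpart) has a runtime length, which clang rejects according to the comment added in those hunks, so under #ifdef __clang__ the patch switches to a heap-allocated zero buffer created before the parallel loop and freed after it. A reduced sketch of that pattern, using a hypothetical bias_fallback() harness and taking nothing from ncnn beyond the workaround itself:

    // Hypothetical harness illustrating the clang VLA workaround used in the hunks;
    // packn is only known at runtime, so a stack array of that size is non-portable.
    void bias_fallback(const float* bias, int packn, int outch)
    {
    #ifdef __clang__
        float* _zero_tmp = new float[packn](); // value-initialized to 0.f
        const float* zeros = _zero_tmp;
    #else
        const float zeros[packn] = {0.f};      // runtime-length array, accepted by gcc here
    #endif

        for (int p = 0; p < outch; p++)
        {
            const float* biasptr = bias ? bias + p : zeros;
            (void)biasptr; // per-channel kernel work would use biasptr here
        }

    #ifdef __clang__
        delete[] _zero_tmp;
    #endif
    }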
@@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); vfloat32m1_t _val2 = vle32_v_f32m1(r0 + packn * 2, vl); vfloat32m1_t _val3 = vle32_v_f32m1(r0 + packn * 3, vl); - vsseg4e32_v_f32m1x4(tmpptr, vcreate_f32m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e32_v_f32m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_rvv(Mat& bottom_blob_tm, int outch, c #else vfloat32m1_t _val0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _val1 = vle32_v_f32m1(r0 + packn, vl); - vsseg2e32_v_f32m1x2(tmpptr, vcreate_f32m1x2(_val0, _val1), vl); + vsseg2e32_v_f32m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h index 0b731519426..ed35ad3e378 100644 --- a/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_dot_packn_fp16s.h @@ -15,7 +15,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int outch, const Mat& kernel_tm, Mat& top_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); // Mat bottom_blob_tm(tiles, 16/36/64, inch, 2u * packn, packn, opt.workspace_allocator); @@ -75,7 +75,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val5 = vle16_v_f16m1(r0 + packn * 5, vl); vfloat16m1_t _val6 = vle16_v_f16m1(r0 + packn * 6, vl); vfloat16m1_t _val7 = vle16_v_f16m1(r0 + packn * 7, vl); - vsseg8e16_v_f16m1x8(tmpptr, vcreate_f16m1x8(_val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7), vl); + vsseg8e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, _val4, _val5, _val6, _val7, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 8; @@ -108,7 +108,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); vfloat16m1_t _val2 = vle16_v_f16m1(r0 + packn * 2, vl); vfloat16m1_t _val3 = vle16_v_f16m1(r0 + packn * 3, vl); - vsseg4e16_v_f16m1x4(tmpptr, vcreate_f16m1x4(_val0, _val1, _val2, _val3), vl); + vsseg4e16_v_f16m1(tmpptr, _val0, _val1, _val2, _val3, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 4; @@ -137,7 +137,7 @@ static void convolution_winograd_dot_packn_fp16sa_rvv(Mat& bottom_blob_tm, int o #else vfloat16m1_t _val0 = vle16_v_f16m1(r0, vl); vfloat16m1_t _val1 = vle16_v_f16m1(r0 + packn, vl); - vsseg2e16_v_f16m1x2(tmpptr, vcreate_f16m1x2(_val0, _val1), vl); + vsseg2e16_v_f16m1(tmpptr, _val0, _val1, vl); r0 += bottom_blob_tm.cstep * packn; tmpptr += packn * 2; diff --git a/src/layer/riscv/convolution_winograd_transform_packn.h b/src/layer/riscv/convolution_winograd_transform_packn.h index db3a05aa92f..f5a52970759 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn.h +++ b/src/layer/riscv/convolution_winograd_transform_packn.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_rvv(const Mat& 
bottom_blo static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_rvv(const Mat& top_blob_ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_rvv(const Mat& bottom_blo static void conv3x3s1_winograd23_transform_output_packn_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h index b1b1ad9f54d..2404a8a4092 100644 --- a/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h +++ b/src/layer/riscv/convolution_winograd_transform_packn_fp16s.h @@ -15,7 +15,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -180,7 +180,7 @@ static void conv3x3s1_winograd63_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -323,7 +323,7 @@ static void conv3x3s1_winograd63_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -436,7 +436,7 @@ static void conv3x3s1_winograd43_transform_input_packn_fp16sa_rvv(const Mat& bot static 
void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; @@ -553,7 +553,7 @@ static void conv3x3s1_winograd43_transform_output_packn_fp16sa_rvv(const Mat& to static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& bottom_blob_tm, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int w = bottom_blob.w; const int h = bottom_blob.h; @@ -646,7 +646,7 @@ static void conv3x3s1_winograd23_transform_input_packn_fp16sa_rvv(const Mat& bot static void conv3x3s1_winograd23_transform_output_packn_fp16sa_rvv(const Mat& top_blob_tm, Mat& top_blob, const Mat& bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const int outw = top_blob.w; const int outh = top_blob.h; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn.h b/src/layer/riscv/convolutiondepthwise_3x3_packn.h index d8aa0ec4ee0..0cab1af0802 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw3x3s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h index c3d73053bea..d479385f6a2 100644 --- a/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_3x3_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -302,7 +302,7 @@ static void convdw3x3s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw3x3s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn.h b/src/layer/riscv/convolutiondepthwise_5x5_packn.h index cd35ef8e816..2ef2fea7455 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t 
vl = vsetvl_e32m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const M static void convdw5x5s2_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h index 1647f96db8c..08270e307c9 100644 --- a/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h +++ b/src/layer/riscv/convolutiondepthwise_5x5_packn_fp16s.h @@ -15,7 +15,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; @@ -336,7 +336,7 @@ static void convdw5x5s1_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, static void convdw5x5s2_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& kernel, const Mat& _bias, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; diff --git a/src/layer/riscv/convolutiondepthwise_riscv.cpp b/src/layer/riscv/convolutiondepthwise_riscv.cpp index e33360e0609..eb39ac0baa7 100644 --- a/src/layer/riscv/convolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/convolutiondepthwise_riscv.cpp @@ -282,7 +282,7 @@ int ConvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, c #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -710,7 +710,7 @@ int ConvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -920,7 +920,7 @@ int ConvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_b int ConvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/crop_riscv.cpp b/src/layer/riscv/crop_riscv.cpp index f7b44efd1a1..80e76fc47b4 100644 --- a/src/layer/riscv/crop_riscv.cpp +++ b/src/layer/riscv/crop_riscv.cpp @@ -43,7 +43,7 @@ static void crop_packn_rvv(const Mat& src, Mat& dst, int top, int left, int pack int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float* ptr = src.row(top) + left * packn; float* outptr = dst; @@ -69,7 +69,7 @@ static void crop_packn_bf16_fp16s_rvv(const Mat& src, Mat& dst, int top, int lef int h = dst.h; int right = src.w - dst.w - left; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const unsigned short* ptr = src.row(top) + left * packn; unsigned short* outptr = dst; diff --git a/src/layer/riscv/deconvolution_pack1ton.h 
b/src/layer/riscv/deconvolution_pack1ton.h index dfbe8e01a2d..ec18f62c1c6 100644 --- a/src/layer/riscv/deconvolution_pack1ton.h +++ b/src/layer/riscv/deconvolution_pack1ton.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_pack1ton, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_pack1ton_fp16s.h b/src/layer/riscv/deconvolution_pack1ton_fp16s.h index a1fcfefc254..168c709217d 100644 --- a/src/layer/riscv/deconvolution_pack1ton_fp16s.h +++ b/src/layer/riscv/deconvolution_pack1ton_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -103,7 +103,7 @@ static void deconvolution_pack1ton_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_pack1ton_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn.h b/src/layer/riscv/deconvolution_packn.h index 457e2b95c92..8cab6c3b0a1 100644 --- a/src/layer/riscv/deconvolution_packn.h +++ b/src/layer/riscv/deconvolution_packn.h @@ -15,7 +15,7 @@ static void deconvolution_packn_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packn, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packn_fp16s.h b/src/layer/riscv/deconvolution_packn_fp16s.h index 46d52470ad0..62fbd2eb731 100644 --- a/src/layer/riscv/deconvolution_packn_fp16s.h +++ b/src/layer/riscv/deconvolution_packn_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -105,7 +105,7 @@ static void deconvolution_packn_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, static void deconvolution_packn_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const 
Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1.h b/src/layer/riscv/deconvolution_packnto1.h index ba81baf3676..2efa9b154d2 100644 --- a/src/layer/riscv/deconvolution_packnto1.h +++ b/src/layer/riscv/deconvolution_packnto1.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_packnto1, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolution_packnto1_fp16s.h b/src/layer/riscv/deconvolution_packnto1_fp16s.h index 5cb0a3c49bd..ab70100fb3b 100644 --- a/src/layer/riscv/deconvolution_packnto1_fp16s.h +++ b/src/layer/riscv/deconvolution_packnto1_fp16s.h @@ -15,7 +15,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -116,7 +116,7 @@ static void deconvolution_packnto1_fp16s_rvv(const Mat& bottom_blob, Mat& top_bl static void deconvolution_packnto1_fp16sa_rvv(const Mat& bottom_blob, Mat& top_blob, const Mat& weight_data_fp16, const Mat& bias_data_fp16, int kernel_w, int kernel_h, int dilation_w, int dilation_h, int stride_w, int stride_h, int activation_type, const Mat& activation_params, const Option& opt) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp index ab20e6c4148..b53e8962fd2 100644 --- a/src/layer/riscv/deconvolutiondepthwise_riscv.cpp +++ b/src/layer/riscv/deconvolutiondepthwise_riscv.cpp @@ -210,7 +210,7 @@ int DeconvolutionDepthWise_riscv::forward(const Mat& bottom_blob, Mat& top_blob, #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif // convolv with NxN kernel @@ -518,7 +518,7 @@ int DeconvolutionDepthWise_riscv::create_pipeline_fp16s(const Option& opt) int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -739,7 +739,7 @@ int DeconvolutionDepthWise_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top int DeconvolutionDepthWise_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const { const int packn = csrr_vlenb() / 2; - 
const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/dropout_riscv.cpp b/src/layer/riscv/dropout_riscv.cpp index fc71db7689a..461edf2d056 100644 --- a/src/layer/riscv/dropout_riscv.cpp +++ b/src/layer/riscv/dropout_riscv.cpp @@ -53,7 +53,7 @@ int Dropout_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8(_p, scale, vl); diff --git a/src/layer/riscv/flatten_riscv.cpp b/src/layer/riscv/flatten_riscv.cpp index 325ab6f175d..491c051c7fe 100644 --- a/src/layer/riscv/flatten_riscv.cpp +++ b/src/layer/riscv/flatten_riscv.cpp @@ -119,7 +119,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, w * sizeof(float), _p, vl); @@ -147,7 +147,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p = vle32_v_f32m1(ptr, vl); vsse32_v_f32m1(outptr, size * sizeof(float), _p, vl); @@ -172,7 +172,7 @@ int Flatten_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vse32_v_f32m8(outptr, _p, vl); @@ -262,7 +262,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); vsse16_v_u16m1(outptr, w * sizeof(unsigned short), _p, vl); @@ -290,7 +290,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p = vle16_v_u16m1(ptr, vl); vsse16_v_u16m1(outptr, size * sizeof(unsigned short), _p, vl); @@ -315,7 +315,7 @@ int Flatten_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vuint16m8_t _p = vle16_v_u16m8(ptr, vl); vse16_v_u16m8(outptr, _p, vl); @@ -405,7 +405,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, w * sizeof(unsigned char), _p, vl); @@ -433,7 +433,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m1(n); + size_t vl = vsetvl_e8m1(n); vint8m1_t _p = vle8_v_i8m1(ptr, vl); vsse8_v_i8m1(outptr, size * sizeof(signed char), _p, vl); @@ -458,7 +458,7 @@ int Flatten_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e8m8(n); + size_t vl = vsetvl_e8m8(n); vint8m8_t _p = vle8_v_i8m8(ptr, vl); vse8_v_i8m8(outptr, _p, vl); diff --git a/src/layer/riscv/gelu_riscv.cpp b/src/layer/riscv/gelu_riscv.cpp index 708e951e5a3..69b374998f3 
100644 --- a/src/layer/riscv/gelu_riscv.cpp +++ b/src/layer/riscv/gelu_riscv.cpp @@ -48,7 +48,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vfloat32m4_t _p = vle32_v_f32m4(ptr, vl); @@ -77,7 +77,7 @@ int GELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); auto _p = vle32_v_f32m8(ptr, vl); auto _perfc = vfmul_vf_f32m8(_p, -.70710678f, vl); _p = vfmul_vf_f32m8(_p, .5f, vl); diff --git a/src/layer/riscv/gru_riscv.cpp b/src/layer/riscv/gru_riscv.cpp index e45d37592ef..28afa5081d0 100644 --- a/src/layer/riscv/gru_riscv.cpp +++ b/src/layer/riscv/gru_riscv.cpp @@ -63,7 +63,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _x = vle32_v_f32m8(ptr_x, vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -93,7 +93,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e32m8(n_out); + size_t vl = vsetvl_e32m8(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -136,7 +136,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e32m8(n_out2); + size_t vl = vsetvl_e32m8(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -160,7 +160,7 @@ static int gru(const Mat& bottom_blob, Mat& top_blob, int reverse, const Mat& we const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _x = vle32_v_f32m8(ptr_x2, vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -428,7 +428,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _x = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x, vl), vl); vfloat32m8_t _xcr = vle32_v_f32m8(ptr_xcr, vl); vfloat32m8_t _xcu = vle32_v_f32m8(ptr_xcu, vl); @@ -458,7 +458,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc, vl); vfloat32m8_t _hcr = vle32_v_f32m8(ptr_hcr, vl); vfloat32m8_t _hcu = vle32_v_f32m8(ptr_hcu, vl); @@ -501,7 +501,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat32m8_t _h_cont = vle32_v_f32m8(ptr_hc2, vl); vfloat32m8_t _whc_n = vle32_v_f32m8(ptr_whc_n, vl); @@ -525,7 +525,7 @@ static int gru_fp16s(const Mat& bottom_blob, Mat& top_blob, int reverse, const M const float* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m4(n2); + size_t vl = vsetvl_e16m4(n2); vfloat32m8_t _x = 
vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr_x2, vl), vl); vfloat32m8_t _xcn = vle32_v_f32m8(ptr_xcn, vl); @@ -758,7 +758,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcu = weight_xc_U; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _x = vle16_v_f16m8(ptr_x, vl); vfloat16m8_t _xcr = vle16_v_f16m8(ptr_xcr, vl); vfloat16m8_t _xcu = vle16_v_f16m8(ptr_xcu, vl); @@ -785,7 +785,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_hcu = weight_hc_U; while (n_out > 0) { - word_type vl = vsetvl_e16m4(n_out); + size_t vl = vsetvl_e16m4(n_out); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc, vl), vl); vfloat16m4_t _hcr = vle16_v_f16m4(ptr_hcr, vl); vfloat16m4_t _hcu = vle16_v_f16m4(ptr_hcu, vl); @@ -825,7 +825,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_whc_n = weight_hc_N; while (n_out2 > 0) { - word_type vl = vsetvl_e16m4(n_out2); + size_t vl = vsetvl_e16m4(n_out2); vfloat16m4_t _h_cont = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_hc2, vl), vl); vfloat16m4_t _whc_n = vle16_v_f16m4(ptr_whc_n, vl); @@ -846,7 +846,7 @@ static int gru_fp16sa(const Mat& bottom_blob, Mat& top_blob, int reverse, const const __fp16* ptr_xcn = weight_xc_N; while (n2 > 0) { - word_type vl = vsetvl_e16m8(n2); + size_t vl = vsetvl_e16m8(n2); vfloat16m8_t _x = vle16_v_f16m8(ptr_x2, vl); vfloat16m8_t _xcn = vle16_v_f16m8(ptr_xcn, vl); diff --git a/src/layer/riscv/hardsigmoid_riscv.cpp b/src/layer/riscv/hardsigmoid_riscv.cpp index 2c3bbec2886..112a1c9c8d2 100644 --- a/src/layer/riscv/hardsigmoid_riscv.cpp +++ b/src/layer/riscv/hardsigmoid_riscv.cpp @@ -60,7 +60,7 @@ int HardSigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/hardswish_riscv.cpp b/src/layer/riscv/hardswish_riscv.cpp index b60197115ca..5d68e07b06a 100644 --- a/src/layer/riscv/hardswish_riscv.cpp +++ b/src/layer/riscv/hardswish_riscv.cpp @@ -60,7 +60,7 @@ int HardSwish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) co int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, lower, vl); @@ -111,7 +111,7 @@ int HardSwish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& o int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, lower, vl); diff --git a/src/layer/riscv/innerproduct_riscv.cpp b/src/layer/riscv/innerproduct_riscv.cpp index 721c6361b8b..30dd7428777 100644 --- a/src/layer/riscv/innerproduct_riscv.cpp +++ b/src/layer/riscv/innerproduct_riscv.cpp @@ -198,7 +198,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #if __riscv_vector if (elempack == packn && num_output_elempack == packn) { - const word_type vl = 
vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -237,7 +237,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -273,7 +273,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); float* outptr = top_blob.row(j); @@ -372,7 +372,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -414,7 +414,7 @@ int InnerProduct_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Opt { int p = pp * packn; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); vfloat32m1_t _sum = vfmv_v_f_f32m1(0.f, vl); if (bias_term) @@ -595,7 +595,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -635,7 +635,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -672,7 +672,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -765,7 +765,7 @@ int InnerProduct_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, con #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat32m2_t _sum = vfmv_v_f_f32m2(0.f, vl); if (bias_term) @@ -857,7 +857,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co { if (elempack == packn && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -897,7 +897,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == 1 && num_output_elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -934,7 +934,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co if (elempack == packn && num_output_elempack == 1) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); __fp16* outptr = top_blob.row<__fp16>(j); @@ -1027,7 +1027,7 @@ int InnerProduct_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, co #pragma omp parallel for num_threads(opt.num_threads) for (int p = 0; p < num_output / out_elempack; p++) { - const word_type vl = 
vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); vfloat16m1_t _sum = vfmv_v_f_f16m1(0.f, vl); if (bias_term) diff --git a/src/layer/riscv/interp_bicubic_packn.h b/src/layer/riscv/interp_bicubic_packn.h index 16ed365ff53..4c4eb869c43 100644 --- a/src/layer/riscv/interp_bicubic_packn.h +++ b/src/layer/riscv/interp_bicubic_packn.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bicubic_packn_fp16s.h b/src/layer/riscv/interp_bicubic_packn_fp16s.h index b83a9eba3c6..ff2284552b7 100644 --- a/src/layer/riscv/interp_bicubic_packn_fp16s.h +++ b/src/layer/riscv/interp_bicubic_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -244,7 +244,7 @@ static void resize_bicubic_image_packn_fp16s(const Mat& src, Mat& dst, float* al static void resize_bicubic_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear.h b/src/layer/riscv/interp_bilinear.h index 1742626017a..0f6338d7310 100644 --- a/src/layer/riscv/interp_bilinear.h +++ b/src/layer/riscv/interp_bilinear.h @@ -86,16 +86,17 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -135,19 +136,21 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m4(n); + size_t vl = vsetvl_e32m4(n); vuint32m4_t _sx = vmul_vx_u32m4(vle32_v_u32m4(pxofs, vl), sizeof(float), vl); - vfloat32m4x2_t _S0p = vloxseg2ei32_v_f32m4x2(S0, _sx, vl); - vfloat32m4x2_t _S1p = vloxseg2ei32_v_f32m4x2(S1, _sx, vl); - vfloat32m4_t _S0p0 = vget_f32m4x2_f32m4(_S0p, 0); - vfloat32m4_t _S0p1 = vget_f32m4x2_f32m4(_S0p, 1); - vfloat32m4_t _S1p0 = vget_f32m4x2_f32m4(_S1p, 0); - vfloat32m4_t _S1p1 = vget_f32m4x2_f32m4(_S1p, 1); - vfloat32m4x2_t _a0a1 = vlseg2e32_v_f32m4x2(alphap, vl); - vfloat32m4_t _a0 = vget_f32m4x2_f32m4(_a0a1, 0); - vfloat32m4_t _a1 = vget_f32m4x2_f32m4(_a0a1, 1); + vfloat32m4_t _S0p0; + vfloat32m4_t _S0p1; + vfloat32m4_t _S1p0; + vfloat32m4_t _S1p1; + + vloxseg2ei32_v_f32m4(&_S0p0, &_S0p1, S0, _sx, vl); + vloxseg2ei32_v_f32m4(&_S1p0, &_S1p1, 
S1, _sx, vl); + + vfloat32m4_t _a0; + vfloat32m4_t _a1; + vlseg2e32_v_f32m4(&_a0, &_a1, alphap, vl); vfloat32m4_t _rows0 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S0p0, _a0, vl), _S0p1, _a1, vl); vfloat32m4_t _rows1 = vfmacc_vv_f32m4(vfmul_vv_f32m4(_S1p0, _a0, vl), _S1p1, _a1, vl); @@ -192,7 +195,7 @@ static void resize_bilinear_image(const Mat& src, Mat& dst, float* alpha, int* x int n = w; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_fp16s.h b/src/layer/riscv/interp_bilinear_fp16s.h index 091e86b7301..cd61af6efac 100644 --- a/src/layer/riscv/interp_bilinear_fp16s.h +++ b/src/layer/riscv/interp_bilinear_fp16s.h @@ -131,7 +131,7 @@ static void resize_bilinear_image_fp16s(const Mat& src, Mat& dst, float* alpha, int n = w; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _rows0 = vle32_v_f32m8(rows0p, vl); vfloat32m8_t _rows1 = vle32_v_f32m8(rows1p, vl); @@ -232,7 +232,7 @@ static void resize_bilinear_image_fp16sa(const Mat& src, Mat& dst, __fp16* alpha int n = w; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _rows0 = vle16_v_f16m8(rows0p, vl); vfloat16m8_t _rows1 = vle16_v_f16m8(rows1p, vl); diff --git a/src/layer/riscv/interp_bilinear_packn.h b/src/layer/riscv/interp_bilinear_packn.h index 0d800e324cb..9dffc01bf30 100644 --- a/src/layer/riscv/interp_bilinear_packn.h +++ b/src/layer/riscv/interp_bilinear_packn.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_bilinear_packn_fp16s.h b/src/layer/riscv/interp_bilinear_packn_fp16s.h index b48fd8431a4..dfe02c00d1b 100644 --- a/src/layer/riscv/interp_bilinear_packn_fp16s.h +++ b/src/layer/riscv/interp_bilinear_packn_fp16s.h @@ -15,7 +15,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* alpha, int* xofs, float* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; @@ -122,7 +122,7 @@ static void resize_bilinear_image_packn_fp16s(const Mat& src, Mat& dst, float* a static void resize_bilinear_image_packn_fp16sa(const Mat& src, Mat& dst, __fp16* alpha, int* xofs, __fp16* beta, int* yofs) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = dst.w; int h = dst.h; diff --git a/src/layer/riscv/interp_riscv.cpp b/src/layer/riscv/interp_riscv.cpp index b72cfd00280..ea8344985ed 100644 --- a/src/layer/riscv/interp_riscv.cpp +++ b/src/layer/riscv/interp_riscv.cpp @@ -88,7 +88,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector #if __riscv_vector if (elempack == packn) { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -130,7 +130,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -153,7 +153,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -190,7 +190,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); int* buf = new int[outw + outw * packn]; @@ -328,7 +328,7 @@ int Interp_riscv::forward(const std::vector& bottom_blobs, std::vector { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -518,7 +518,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (elempack == packn) { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #pragma omp parallel for num_threads(opt.num_threads) for (int q = 0; q < w; q++) @@ -558,7 +558,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float ws = output_width ? w / (float)outw : 1.f / width_scale; @@ -581,7 +581,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -618,7 +618,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -754,7 +754,7 @@ int Interp_riscv::forward_fp16s(const std::vector& bottom_blobs, std::vecto { if (resize_type == 1) // nearest { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); const float hs = output_height ? h / (float)outh : 1.f / height_scale; const float ws = output_width ? 
w / (float)outw : 1.f / width_scale; @@ -955,7 +955,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect { if (resize_type == 2) // bilinear { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; @@ -992,7 +992,7 @@ int Interp_riscv::forward_fp16sa(const std::vector& bottom_blobs, std::vect if (resize_type == 3) // bicubic { - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int* buf = new int[outw + outw * packn]; diff --git a/src/layer/riscv/mish_riscv.cpp b/src/layer/riscv/mish_riscv.cpp index abee1ec3748..4ddb1470006 100644 --- a/src/layer/riscv/mish_riscv.cpp +++ b/src/layer/riscv/mish_riscv.cpp @@ -64,7 +64,7 @@ int Mish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -103,7 +103,7 @@ int Mish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfmul_vv_f32m8(_p, tanh_ps(log_ps(vfadd_vf_f32m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); @@ -134,7 +134,7 @@ int Mish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vv_f16m8(_p, tanh_ps(log_ps(vfadd_vf_f16m8(exp_ps(_p, vl), 1.f, vl), vl), vl), vl); diff --git a/src/layer/riscv/packing_riscv.cpp b/src/layer/riscv/packing_riscv.cpp index 1805c2469eb..5c298da522d 100644 --- a/src/layer/riscv/packing_riscv.cpp +++ b/src/layer/riscv/packing_riscv.cpp @@ -18,6 +18,8 @@ #include #endif // __riscv_vector +#include "riscv_usability.h" + namespace ncnn { Packing_riscv::Packing_riscv() @@ -137,13 +139,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -181,13 +183,18 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); + + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -229,7 +236,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = 
vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -239,7 +246,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -289,17 +296,25 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse32_v_f32m1(outptr0, _p0, vl); + vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -343,19 +358,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -395,19 +412,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = w; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = 
vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -466,13 +483,13 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); + size_t vl = vsetvl_e32m2(n); vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl); vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl); vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl); vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl); - vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e32_v_f32m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -510,13 +527,16 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m2(n); - - vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl); - vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl); - vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl); - vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl); - vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl); + size_t vl = vsetvl_e32m2(n); + vfloat32m2_t _p0; + vfloat32m2_t _p1; + vfloat32m2_t _p2; + vfloat32m2_t _p3; + vlseg4e32_v_f32m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse32_v_f32m2(outptr0, _p0, vl); + vse32_v_f32m2(outptr1, _p1, vl); + vse32_v_f32m2(outptr2, _p2, vl); + vse32_v_f32m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -558,7 +578,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); vfloat32m1_t _p0 = vle32_v_f32m1(r0, vl); vfloat32m1_t _p1 = vle32_v_f32m1(r1, vl); @@ -568,7 +588,7 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& vfloat32m1_t _p5 = vle32_v_f32m1(r5, vl); vfloat32m1_t _p6 = vle32_v_f32m1(r6, vl); vfloat32m1_t _p7 = vle32_v_f32m1(r7, vl); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e32_v_f32m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -618,17 +638,26 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vse32_v_f32m1(outptr0, vget_f32m1x8_f32m1(_p, 0), vl); - vse32_v_f32m1(outptr1, vget_f32m1x8_f32m1(_p, 1), vl); - vse32_v_f32m1(outptr2, vget_f32m1x8_f32m1(_p, 2), vl); - vse32_v_f32m1(outptr3, vget_f32m1x8_f32m1(_p, 3), vl); - vse32_v_f32m1(outptr4, vget_f32m1x8_f32m1(_p, 4), vl); - vse32_v_f32m1(outptr5, vget_f32m1x8_f32m1(_p, 5), vl); - vse32_v_f32m1(outptr6, vget_f32m1x8_f32m1(_p, 6), vl); - vse32_v_f32m1(outptr7, vget_f32m1x8_f32m1(_p, 7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse32_v_f32m1(outptr0, _p0, vl); + 
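// NOTE: the pack8 -> pack1 path below de-interleaves eight rows at once.
// vlseg8e32_v_f32m1(&_p0, ..., r0, vl) is the compatibility helper added in
// riscv_usability.h later in this diff: on toolchains that expose the RVV
// tuple types it wraps vlseg8e32_v_f32m1x8() plus one vget_f32m1x8_f32m1()
// per field, so the call sites no longer spell out vfloat32m1x8_t.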
vse32_v_f32m1(outptr1, _p1, vl); + vse32_v_f32m1(outptr2, _p2, vl); + vse32_v_f32m1(outptr3, _p3, vl); + vse32_v_f32m1(outptr4, _p4, vl); + vse32_v_f32m1(outptr5, _p5, vl); + vse32_v_f32m1(outptr6, _p6, vl); + vse32_v_f32m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -672,20 +701,21 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); + size_t vl = vsetvl_e32m1(n); - vfloat32m1x4_t _p0 = vlseg4e32_v_f32m1x4(r0, vl); - vfloat32m1x4_t _p1 = vlseg4e32_v_f32m1x4(r1, vl); + vfloat32m1_t _p00; + vfloat32m1_t _p01; + vfloat32m1_t _p02; + vfloat32m1_t _p03; + vlseg4e32_v_f32m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vfloat32m1_t _p00 = vget_f32m1x4_f32m1(_p0, 0); - vfloat32m1_t _p01 = vget_f32m1x4_f32m1(_p0, 1); - vfloat32m1_t _p02 = vget_f32m1x4_f32m1(_p0, 2); - vfloat32m1_t _p03 = vget_f32m1x4_f32m1(_p0, 3); - vfloat32m1_t _p10 = vget_f32m1x4_f32m1(_p1, 0); - vfloat32m1_t _p11 = vget_f32m1x4_f32m1(_p1, 1); - vfloat32m1_t _p12 = vget_f32m1x4_f32m1(_p1, 2); - vfloat32m1_t _p13 = vget_f32m1x4_f32m1(_p1, 3); - vsseg8e32_v_f32m1x8(outptr, vcreate_f32m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vfloat32m1_t _p10; + vfloat32m1_t _p11; + vfloat32m1_t _p12; + vfloat32m1_t _p13; + vlseg4e32_v_f32m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e32_v_f32m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -725,19 +755,19 @@ int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& int n = size; while (n > 0) { - word_type vl = vsetvl_e32m1(n); - - vfloat32m1x8_t _p = vlseg8e32_v_f32m1x8(r0, vl); - vfloat32m1_t _p0 = vget_f32m1x8_f32m1(_p, 0); - vfloat32m1_t _p1 = vget_f32m1x8_f32m1(_p, 1); - vfloat32m1_t _p2 = vget_f32m1x8_f32m1(_p, 2); - vfloat32m1_t _p3 = vget_f32m1x8_f32m1(_p, 3); - vfloat32m1_t _p4 = vget_f32m1x8_f32m1(_p, 4); - vfloat32m1_t _p5 = vget_f32m1x8_f32m1(_p, 5); - vfloat32m1_t _p6 = vget_f32m1x8_f32m1(_p, 6); - vfloat32m1_t _p7 = vget_f32m1x8_f32m1(_p, 7); - vsseg4e32_v_f32m1x4(outptr0, vcreate_f32m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e32_v_f32m1x4(outptr1, vcreate_f32m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e32m1(n); + + vfloat32m1_t _p0; + vfloat32m1_t _p1; + vfloat32m1_t _p2; + vfloat32m1_t _p3; + vfloat32m1_t _p4; + vfloat32m1_t _p5; + vfloat32m1_t _p6; + vfloat32m1_t _p7; + vlseg8e32_v_f32m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vsseg4e32_v_f32m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e32_v_f32m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -859,13 +889,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -903,13 +933,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, 
vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -951,7 +985,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -961,7 +995,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1011,17 +1045,26 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1065,19 +1108,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); + + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); + + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1117,19 +1162,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& 
bottom_blob, Mat& top_blob, co int n = w; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; @@ -1188,13 +1234,13 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); + size_t vl = vsetvl_e16m2(n); vuint16m2_t _p0 = vle16_v_u16m2(r0, vl); vuint16m2_t _p1 = vle16_v_u16m2(r1, vl); vuint16m2_t _p2 = vle16_v_u16m2(r2, vl); vuint16m2_t _p3 = vle16_v_u16m2(r3, vl); - vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl); + vsseg4e16_v_u16m2(outptr, _p0, _p1, _p2, _p3, vl); r0 += vl; r1 += vl; @@ -1232,13 +1278,17 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m2(n); - - vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl); - vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl); - vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl); - vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl); - vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl); + size_t vl = vsetvl_e16m2(n); + + vuint16m2_t _p0; + vuint16m2_t _p1; + vuint16m2_t _p2; + vuint16m2_t _p3; + vlseg4e16_v_u16m2(&_p0, &_p1, &_p2, &_p3, r0, vl); + vse16_v_u16m2(outptr0, _p0, vl); + vse16_v_u16m2(outptr1, _p1, vl); + vse16_v_u16m2(outptr2, _p2, vl); + vse16_v_u16m2(outptr3, _p3, vl); r0 += vl * 4; outptr0 += vl; @@ -1280,7 +1330,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); vuint16m1_t _p0 = vle16_v_u16m1(r0, vl); vuint16m1_t _p1 = vle16_v_u16m1(r1, vl); @@ -1290,7 +1340,7 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co vuint16m1_t _p5 = vle16_v_u16m1(r5, vl); vuint16m1_t _p6 = vle16_v_u16m1(r6, vl); vuint16m1_t _p7 = vle16_v_u16m1(r7, vl); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl); + vsseg8e16_v_u16m1(outptr, _p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7, vl); r0 += vl; r1 += vl; @@ -1340,17 +1390,25 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl); - vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl); - vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl); - vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl); - vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), 
vl); - vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl); - vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl); - vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + vse16_v_u16m1(outptr0, _p0, vl); + vse16_v_u16m1(outptr1, _p1, vl); + vse16_v_u16m1(outptr2, _p2, vl); + vse16_v_u16m1(outptr3, _p3, vl); + vse16_v_u16m1(outptr4, _p4, vl); + vse16_v_u16m1(outptr5, _p5, vl); + vse16_v_u16m1(outptr6, _p6, vl); + vse16_v_u16m1(outptr7, _p7, vl); r0 += vl * 8; outptr0 += vl; @@ -1394,20 +1452,21 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p00; + vuint16m1_t _p01; + vuint16m1_t _p02; + vuint16m1_t _p03; + vlseg4e16_v_u16m1(&_p00, &_p01, &_p02, &_p03, r0, vl); - vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl); - vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl); + vuint16m1_t _p10; + vuint16m1_t _p11; + vuint16m1_t _p12; + vuint16m1_t _p13; + vlseg4e16_v_u16m1(&_p10, &_p11, &_p12, &_p13, r1, vl); - vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0); - vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1); - vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2); - vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3); - vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0); - vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1); - vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2); - vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3); - vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl); + vsseg8e16_v_u16m1(outptr, _p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13, vl); r0 += vl * 4; r1 += vl * 4; @@ -1447,19 +1506,20 @@ int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co int n = size; while (n > 0) { - word_type vl = vsetvl_e16m1(n); - - vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl); - vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0); - vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1); - vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2); - vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3); - vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4); - vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5); - vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6); - vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7); - vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl); - vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl); + size_t vl = vsetvl_e16m1(n); + + vuint16m1_t _p0; + vuint16m1_t _p1; + vuint16m1_t _p2; + vuint16m1_t _p3; + vuint16m1_t _p4; + vuint16m1_t _p5; + vuint16m1_t _p6; + vuint16m1_t _p7; + vlseg8e16_v_u16m1(&_p0, &_p1, &_p2, &_p3, &_p4, &_p5, &_p6, &_p7, r0, vl); + + vsseg4e16_v_u16m1(outptr0, _p0, _p1, _p2, _p3, vl); + vsseg4e16_v_u16m1(outptr1, _p4, _p5, _p6, _p7, vl); r0 += vl * 8; outptr0 += vl * 4; diff --git a/src/layer/riscv/padding_packn.h b/src/layer/riscv/padding_packn.h index 1f93ecfe92d..50f5efe1216 100644 --- a/src/layer/riscv/padding_packn.h +++ b/src/layer/riscv/padding_packn.h @@ -16,7 +16,7 @@ static void padding_constant_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right, v##VT##m##LMUL##_t v) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = 
vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -65,7 +65,7 @@ static void padding_replicate_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ @@ -144,7 +144,7 @@ static void padding_reflect_packn_##VT##_rvv(const Mat& src, Mat& dst, int top, int bottom, int left, int right) \ { \ const int packn = csrr_vlenb() / sizeof(T); \ - const word_type vl = vsetvl_e##SEW##m##LMUL(packn); \ + const size_t vl = vsetvl_e##SEW##m##LMUL(packn); \ \ const T* ptr = src; \ T* outptr = dst; \ diff --git a/src/layer/riscv/padding_riscv.cpp b/src/layer/riscv/padding_riscv.cpp index de29af0f6bf..8f4b54da590 100644 --- a/src/layer/riscv/padding_riscv.cpp +++ b/src/layer/riscv/padding_riscv.cpp @@ -91,7 +91,7 @@ int Padding_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -261,7 +261,7 @@ int Padding_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, co { #if __riscv_vector const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); #endif int w = bottom_blob.w; @@ -511,7 +511,7 @@ int Padding_riscv::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Opt { #if __riscv_vector const int packn = csrr_vlenb() / 1; - const word_type vl = vsetvl_e8m1(packn); + const size_t vl = vsetvl_e8m1(packn); #endif int w = bottom_blob.w; diff --git a/src/layer/riscv/pooling_riscv.cpp b/src/layer/riscv/pooling_riscv.cpp index 0ca4e3d894c..1b4c1f0ed8a 100644 --- a/src/layer/riscv/pooling_riscv.cpp +++ b/src/layer/riscv/pooling_riscv.cpp @@ -72,7 +72,7 @@ int Pooling_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& #if __riscv_vector const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m1(packn); + const size_t vl = vsetvl_e32m1(packn); #endif int w = bottom_blob.w; @@ -315,7 +315,7 @@ int Pooling_riscv::forward_fp16s(const Mat& bottom_blob, Mat& top_blob, const Op // avg value in NxN window const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; @@ -721,7 +721,7 @@ int Pooling_riscv::forward_fp16sa(const Mat& bottom_blob, Mat& top_blob, const O } const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m1(packn); + const size_t vl = vsetvl_e16m1(packn); int w = bottom_blob.w; int h = bottom_blob.h; diff --git a/src/layer/riscv/prelu_riscv.cpp b/src/layer/riscv/prelu_riscv.cpp index c25223461a1..32cb77023b4 100644 --- a/src/layer/riscv/prelu_riscv.cpp +++ b/src/layer/riscv/prelu_riscv.cpp @@ -63,7 +63,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -84,7 +84,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type 
vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -115,7 +115,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -135,7 +135,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -170,7 +170,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -191,7 +191,7 @@ int PReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const float slope = num_slope > 1 ? slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -303,7 +303,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -324,7 +324,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -355,7 +355,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(ptr_slope, vl); @@ -375,7 +375,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -410,7 +410,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vfloat32m8_t _slope = vle32_v_f32m8(slope_ptr, vl); @@ -431,7 +431,7 @@ int PReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, .0f, vl); @@ -468,7 +468,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); vbool4_t _lower = vmflt_vf_f16m4_b4(_p, .0f, vl); @@ -489,7 +489,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) // #pragma omp parallel for num_threads(opt.num_threads) while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -520,7 +520,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(ptr_slope, vl), vl); @@ -540,7 +540,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); @@ -575,7 +575,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) const float* slope_ptr = (const float*)slope_data + q * elempack; while (n1 > 0) { - word_type vl = vsetvl_e16m4(n1); + size_t vl = vsetvl_e16m4(n1); vfloat16m4_t _p = vle16_v_f16m4(ptr, vl); vfloat16m4_t _slope = vfncvt_f_f_w_f16m4(vle32_v_f32m8(slope_ptr, vl), vl); @@ -596,7 +596,7 @@ int PReLU_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) float slope = num_slope > 1 ? 
slope_data[q] : slope_data[0]; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); vbool2_t _lower = vmflt_vf_f16m8_b2(_p, .0f, vl); diff --git a/src/layer/riscv/relu_riscv.cpp b/src/layer/riscv/relu_riscv.cpp index 6b23ebc3a63..cf2d4057069 100644 --- a/src/layer/riscv/relu_riscv.cpp +++ b/src/layer/riscv/relu_riscv.cpp @@ -58,10 +58,10 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); - _p = vfmax_vf_f32m8(_p, (float32_t)0.f, vl); + _p = vfmax_vf_f32m8(_p, 0.f, vl); vse32_v_f32m8(ptr, _p, vl); ptr += vl; @@ -82,7 +82,7 @@ int ReLU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfmul_vf_f32m8_m(vmflt_vf_f32m8_b4(_p, .0f, vl), _p, _p, slope, vl); //slope: float(float32_t) @@ -124,10 +124,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); - _p = vfmax_vf_f16m8(_p, (float16_t)0.f, vl); + _p = vfmax_vf_f16m8(_p, (__fp16)0.f, vl); vse16_v_f16m8(ptr, _p, vl); ptr += vl; @@ -137,10 +137,10 @@ int ReLU_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c else { int n = size; - float16_t _slope = (float16_t)slope; + __fp16 _slope = (__fp16)slope; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfmul_vf_f16m8_m(vmflt_vf_f16m8_b2(_p, .0f, vl), _p, _p, _slope, vl); diff --git a/src/layer/riscv/riscv_activation.h b/src/layer/riscv/riscv_activation.h index 763e719b15d..d5f114f3aaa 100644 --- a/src/layer/riscv/riscv_activation.h +++ b/src/layer/riscv/riscv_activation.h @@ -22,49 +22,49 @@ #include "rvv_mathfun.h" #include "rvv_mathfun_fp16s.h" -#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ - static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, word_type vl) \ - { \ - if (activation_type == 1) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ - } \ - else if (activation_type == 2) \ - { \ - vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ - _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ - } \ - else if (activation_type == 3) \ - { \ - _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ - _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ - } \ - else if (activation_type == 4) \ - { \ - _v = sigmoid_ps(_v, vl); \ - } \ - else if (activation_type == 5) \ - { \ - _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ - } \ - else if (activation_type == 6) \ - { \ - const float alpha = activation_params[0]; \ - const float beta = activation_params[1]; \ - const float lower = -beta / alpha; \ - const float upper = (1.f / alpha) + lower; \ - vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ - vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ - vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ - _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, 
_v, .0f, vl); \ - \ - vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ - _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ - vl); \ - _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ - } \ - \ - return _v; \ +#define _RVV_FLOAT_ACTIVATION_PS(SEW, LMUL, MLEN) \ + static inline vfloat##SEW##m##LMUL##_t activation_ps(vfloat##SEW##m##LMUL##_t _v, int activation_type, const ncnn::Mat& activation_params, size_t vl) \ + { \ + if (activation_type == 1) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, 0.f, vl); \ + } \ + else if (activation_type == 2) \ + { \ + vbool##MLEN##_t _lemask = vmfle_vf_f##SEW##m##LMUL##_b##MLEN(_v, 0.f, vl); \ + _v = vfmul_vf_f##SEW##m##LMUL##_m(_lemask, _v, _v, activation_params[0], vl); \ + } \ + else if (activation_type == 3) \ + { \ + _v = vfmax_vf_f##SEW##m##LMUL(_v, activation_params[0], vl); \ + _v = vfmin_vf_f##SEW##m##LMUL(_v, activation_params[1], vl); \ + } \ + else if (activation_type == 4) \ + { \ + _v = sigmoid_ps(_v, vl); \ + } \ + else if (activation_type == 5) \ + { \ + _v = vfmul_vv_f##SEW##m##LMUL(_v, tanh_ps(log_ps(vfadd_vf_f##SEW##m##LMUL(exp_ps(_v, vl), 1.f, vl), vl), vl), vl); \ + } \ + else if (activation_type == 6) \ + { \ + const float alpha = activation_params[0]; \ + const float beta = activation_params[1]; \ + const float lower = -beta / alpha; \ + const float upper = (1.f / alpha) + lower; \ + vbool##MLEN##_t _lower = vmflt_vf_f##SEW##m##LMUL##_b##MLEN(_v, lower, vl); \ + vbool##MLEN##_t _higher = vmfgt_vf_f##SEW##m##LMUL##_b##MLEN(_v, upper, vl); \ + vbool##MLEN##_t _apply = vmnor_mm_b##MLEN(_lower, _higher, vl); \ + _v = vfmerge_vfm_f##SEW##m##LMUL(_lower, _v, .0f, vl); \ + \ + vfloat##SEW##m##LMUL##_t _p0 = vfadd_vf_f##SEW##m##LMUL##_m( \ + _apply, _v, /*op1*/ vfmul_vf_f##SEW##m##LMUL##_m(_apply, _v, _v, alpha, vl), beta, \ + vl); \ + _v = vfmul_vv_f##SEW##m##LMUL##_m(_apply, _v, /*op1*/ _v, _p0, vl); \ + } \ + \ + return _v; \ } _RVV_FLOAT_ACTIVATION_PS(16, 1, 16) diff --git a/src/layer/riscv/riscv_usability.h b/src/layer/riscv/riscv_usability.h index f60faad50f7..596bf4435c6 100644 --- a/src/layer/riscv/riscv_usability.h +++ b/src/layer/riscv/riscv_usability.h @@ -53,7 +53,7 @@ static inline int csrr_vlenb() static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) { const int packn = csrr_vlenb() / 4; - const word_type vl = vsetvl_e32m8(packn * 8); + const size_t vl = vsetvl_e32m8(packn * 8); // NOTE vloxei8_v_f32m8 gets illegal instruction on d1 --- nihui @@ -90,7 +90,7 @@ static inline vfloat32m8_t vle32_v_f32m8_f32m1(const float* ptr) static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) { const int packn = csrr_vlenb() / 2; - const word_type vl = vsetvl_e16m8(packn * 8); + const size_t vl = vsetvl_e16m8(packn * 8); // NOTE vloxei8_v_f16m8 gets illegal instruction on d1 --- nihui @@ -125,4 +125,278 @@ static inline vfloat16m8_t vle16_v_f16m8_f16m1(const __fp16* ptr) #endif // __riscv_zfh #endif // __riscv_vector +#if __riscv_vector && __rvv_tuple + +// f32m1, vsseg.v +static inline void vsseg8e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e32_v_f32m1x8(base, _tmp, vl); +} + +static inline void vsseg4e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +{ + vfloat32m1x4_t _tmp = 
vcreate_f32m1x4(v0, v1, v2, v3); + vsseg4e32_v_f32m1x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m1(float32_t* base, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vsseg2e32_v_f32m1x2(base, _tmp, vl); +} + +// f32m1, vssseg.v, 8/4/2 +static inline void vssseg8e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, vfloat32m1_t v4, vfloat32m1_t v5, vfloat32m1_t v6, vfloat32m1_t v7, size_t vl) +{ + vfloat32m1x8_t _tmp = vcreate_f32m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e32_v_f32m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, vfloat32m1_t v2, vfloat32m1_t v3, size_t vl) +{ + vfloat32m1x4_t _tmp = vcreate_f32m1x4(v0, v1, v2, v3); + vssseg4e32_v_f32m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e32_v_f32m1(float32_t* base, ptrdiff_t bstride, vfloat32m1_t v0, vfloat32m1_t v1, size_t vl) +{ + vfloat32m1x2_t _tmp = vcreate_f32m1x2(v0, v1); + vssseg2e32_v_f32m1x2(base, bstride, _tmp, vl); +} + +// f32m2, vsseg.v, 4/2 +static inline void vsseg4e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, vfloat32m2_t v2, vfloat32m2_t v3, size_t vl) +{ + vfloat32m2x4_t _tmp = vcreate_f32m2x4(v0, v1, v2, v3); + vsseg4e32_v_f32m2x4(base, _tmp, vl); +} + +static inline void vsseg2e32_v_f32m2(float32_t* base, vfloat32m2_t v0, vfloat32m2_t v1, size_t vl) +{ + vfloat32m2x2_t _tmp = vcreate_f32m2x2(v0, v1); + vsseg2e32_v_f32m2x2(base, _tmp, vl); +} + +// u16m1, vsseg.v, 8/4 +static inline void vsseg8e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, vuint16m1_t v4, vuint16m1_t v5, vuint16m1_t v6, vuint16m1_t v7, size_t vl) +{ + vuint16m1x8_t _tmp = vcreate_u16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_u16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_u16m1(uint16_t* base, vuint16m1_t v0, vuint16m1_t v1, vuint16m1_t v2, vuint16m1_t v3, size_t vl) +{ + vuint16m1x4_t _tmp = vcreate_u16m1x4(v0, v1, v2, v3); + vsseg4e16_v_u16m1x4(base, _tmp, vl); +} + +// u16m2, vsseg.v, 4/2 +static inline void vsseg4e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, vuint16m2_t v2, vuint16m2_t v3, size_t vl) +{ + vuint16m2x4_t _tmp = vcreate_u16m2x4(v0, v1, v2, v3); + vsseg4e16_v_u16m2x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_u16m2(uint16_t* base, vuint16m2_t v0, vuint16m2_t v1, size_t vl) +{ + vuint16m2x2_t _tmp = vcreate_u16m2x2(v0, v1); + vsseg2e16_v_u16m2x2(base, _tmp, vl); +} + +// f32m1, vlseg.v 8/4/2 +static inline void vlseg8e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, vfloat32m1_t* v4, vfloat32m1_t* v5, vfloat32m1_t* v6, vfloat32m1_t* v7, const float32_t* base, size_t vl) +{ + vfloat32m1x8_t _tmp = vlseg8e32_v_f32m1x8(base, vl); + *v0 = vget_f32m1x8_f32m1(_tmp, 0); + *v1 = vget_f32m1x8_f32m1(_tmp, 1); + *v2 = vget_f32m1x8_f32m1(_tmp, 2); + *v3 = vget_f32m1x8_f32m1(_tmp, 3); + *v4 = vget_f32m1x8_f32m1(_tmp, 4); + *v5 = vget_f32m1x8_f32m1(_tmp, 5); + *v6 = vget_f32m1x8_f32m1(_tmp, 6); + *v7 = vget_f32m1x8_f32m1(_tmp, 7); +} + +static inline void vlseg4e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, vfloat32m1_t* v2, vfloat32m1_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m1x4_t _tmp = vlseg4e32_v_f32m1x4(base, vl); + *v0 = vget_f32m1x4_f32m1(_tmp, 0); + *v1 = vget_f32m1x4_f32m1(_tmp, 1); + *v2 = vget_f32m1x4_f32m1(_tmp, 2); + *v3 = 
vget_f32m1x4_f32m1(_tmp, 3); +} + +static inline void vlseg2e32_v_f32m1(vfloat32m1_t* v0, vfloat32m1_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m1x2_t _tmp = vlseg2e32_v_f32m1x2(base, vl); + *v0 = vget_f32m1x2_f32m1(_tmp, 0); + *v1 = vget_f32m1x2_f32m1(_tmp, 1); +} + +// f32m2, vlseg.v, 4 +static inline void vlseg4e32_v_f32m2(vfloat32m2_t* v0, vfloat32m2_t* v1, vfloat32m2_t* v2, vfloat32m2_t* v3, const float32_t* base, size_t vl) +{ + vfloat32m2x4_t _tmp = vlseg4e32_v_f32m2x4(base, vl); + *v0 = vget_f32m2x4_f32m2(_tmp, 0); + *v1 = vget_f32m2x4_f32m2(_tmp, 1); + *v2 = vget_f32m2x4_f32m2(_tmp, 2); + *v3 = vget_f32m2x4_f32m2(_tmp, 3); +} + +// f32m4, vlseg.v, 2 +static inline void vlseg2e32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, size_t vl) +{ + vfloat32m4x2_t _tmp = vlseg2e32_v_f32m4x2(base, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// f32m4, vloxseg.v +static inline void vloxseg2ei32_v_f32m4(vfloat32m4_t* v0, vfloat32m4_t* v1, const float32_t* base, vuint32m4_t bindex, size_t vl) +{ + vfloat32m4x2_t _tmp = vloxseg2ei32_v_f32m4x2(base, bindex, vl); + *v0 = vget_f32m4x2_f32m4(_tmp, 0); + *v1 = vget_f32m4x2_f32m4(_tmp, 1); +} + +// u16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, vuint16m1_t* v4, vuint16m1_t* v5, vuint16m1_t* v6, vuint16m1_t* v7, const uint16_t* base, size_t vl) +{ + vuint16m1x8_t _tmp = vlseg8e16_v_u16m1x8(base, vl); + *v0 = vget_u16m1x8_u16m1(_tmp, 0); + *v1 = vget_u16m1x8_u16m1(_tmp, 1); + *v2 = vget_u16m1x8_u16m1(_tmp, 2); + *v3 = vget_u16m1x8_u16m1(_tmp, 3); + *v4 = vget_u16m1x8_u16m1(_tmp, 4); + *v5 = vget_u16m1x8_u16m1(_tmp, 5); + *v6 = vget_u16m1x8_u16m1(_tmp, 6); + *v7 = vget_u16m1x8_u16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, vuint16m1_t* v2, vuint16m1_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m1x4_t _tmp = vlseg4e16_v_u16m1x4(base, vl); + *v0 = vget_u16m1x4_u16m1(_tmp, 0); + *v1 = vget_u16m1x4_u16m1(_tmp, 1); + *v2 = vget_u16m1x4_u16m1(_tmp, 2); + *v3 = vget_u16m1x4_u16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_u16m1(vuint16m1_t* v0, vuint16m1_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m1x2_t _tmp = vlseg2e16_v_u16m1x2(base, vl); + *v0 = vget_u16m1x2_u16m1(_tmp, 0); + *v1 = vget_u16m1x2_u16m1(_tmp, 1); +} + +// u16m2, vlseg.v, 4 +static inline void vlseg4e16_v_u16m2(vuint16m2_t* v0, vuint16m2_t* v1, vuint16m2_t* v2, vuint16m2_t* v3, const uint16_t* base, size_t vl) +{ + vuint16m2x4_t _tmp = vlseg4e16_v_u16m2x4(base, vl); + *v0 = vget_u16m2x4_u16m2(_tmp, 0); + *v1 = vget_u16m2x4_u16m2(_tmp, 1); + *v2 = vget_u16m2x4_u16m2(_tmp, 2); + *v3 = vget_u16m2x4_u16m2(_tmp, 3); +} + +// u16m4, vlseg.v, 2 +static inline void vlseg2e16_v_u16m4(vuint16m4_t* v0, vuint16m4_t* v1, const uint16_t* base, size_t vl) +{ + vuint16m4x2_t _tmp = vlseg2e16_v_u16m4x2(base, vl); + *v0 = vget_u16m4x2_u16m4(_tmp, 0); + *v1 = vget_u16m4x2_u16m4(_tmp, 1); +} + +#if __riscv_zfh + +// f16m1, vsseg.v, 8/4/2 +static inline void vsseg8e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vsseg8e16_v_f16m1x8(base, _tmp, vl); +} + +static inline void vsseg4e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) +{ + 
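// gather the four f16 vectors into a tuple and store them interleaved,
// mirroring the f32m1/u16m1 vsseg4 helpers above (guarded by __riscv_zfh)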
vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vsseg4e16_v_f16m1x4(base, _tmp, vl); +} + +static inline void vsseg2e16_v_f16m1(float16_t* base, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vsseg2e16_v_f16m1x2(base, _tmp, vl); +} + +// f16m1, vssseg.v, 8/4/2 +static inline void vssseg8e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, vfloat16m1_t v4, vfloat16m1_t v5, vfloat16m1_t v6, vfloat16m1_t v7, size_t vl) +{ + vfloat16m1x8_t _tmp = vcreate_f16m1x8(v0, v1, v2, v3, v4, v5, v6, v7); + vssseg8e16_v_f16m1x8(base, bstride, _tmp, vl); +} + +static inline void vssseg4e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, vfloat16m1_t v2, vfloat16m1_t v3, size_t vl) +{ + vfloat16m1x4_t _tmp = vcreate_f16m1x4(v0, v1, v2, v3); + vssseg4e16_v_f16m1x4(base, bstride, _tmp, vl); +} + +static inline void vssseg2e16_v_f16m1(float16_t* base, ptrdiff_t bstride, vfloat16m1_t v0, vfloat16m1_t v1, size_t vl) +{ + vfloat16m1x2_t _tmp = vcreate_f16m1x2(v0, v1); + vssseg2e16_v_f16m1x2(base, bstride, _tmp, vl); +} + +// f16m1, vlseg.v 8/4/2 +static inline void vlseg8e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, vfloat16m1_t* v4, vfloat16m1_t* v5, vfloat16m1_t* v6, vfloat16m1_t* v7, const float16_t* base, size_t vl) +{ + vfloat16m1x8_t _tmp = vlseg8e16_v_f16m1x8(base, vl); + *v0 = vget_f16m1x8_f16m1(_tmp, 0); + *v1 = vget_f16m1x8_f16m1(_tmp, 1); + *v2 = vget_f16m1x8_f16m1(_tmp, 2); + *v3 = vget_f16m1x8_f16m1(_tmp, 3); + *v4 = vget_f16m1x8_f16m1(_tmp, 4); + *v5 = vget_f16m1x8_f16m1(_tmp, 5); + *v6 = vget_f16m1x8_f16m1(_tmp, 6); + *v7 = vget_f16m1x8_f16m1(_tmp, 7); +} + +static inline void vlseg4e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, vfloat16m1_t* v2, vfloat16m1_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m1x4_t _tmp = vlseg4e16_v_f16m1x4(base, vl); + *v0 = vget_f16m1x4_f16m1(_tmp, 0); + *v1 = vget_f16m1x4_f16m1(_tmp, 1); + *v2 = vget_f16m1x4_f16m1(_tmp, 2); + *v3 = vget_f16m1x4_f16m1(_tmp, 3); +} + +static inline void vlseg2e16_v_f16m1(vfloat16m1_t* v0, vfloat16m1_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m1x2_t _tmp = vlseg2e16_v_f16m1x2(base, vl); + *v0 = vget_f16m1x2_f16m1(_tmp, 0); + *v1 = vget_f16m1x2_f16m1(_tmp, 1); +} + +// f16m2, vlseg.v, 4 +static inline void vlseg4e16_v_f16m2(vfloat16m2_t* v0, vfloat16m2_t* v1, vfloat16m2_t* v2, vfloat16m2_t* v3, const float16_t* base, size_t vl) +{ + vfloat16m2x4_t _tmp = vlseg4e16_v_f16m2x4(base, vl); + *v0 = vget_f16m2x4_f16m2(_tmp, 0); + *v1 = vget_f16m2x4_f16m2(_tmp, 1); + *v2 = vget_f16m2x4_f16m2(_tmp, 2); + *v3 = vget_f16m2x4_f16m2(_tmp, 3); +} + +// f16m4, vlseg.v, 2 +static inline void vlseg2e16_v_f16m4(vfloat16m4_t* v0, vfloat16m4_t* v1, const float16_t* base, size_t vl) +{ + vfloat16m4x2_t _tmp = vlseg2e16_v_f16m4x2(base, vl); + *v0 = vget_f16m4x2_f16m4(_tmp, 0); + *v1 = vget_f16m4x2_f16m4(_tmp, 1); +} + +#endif // __riscv_zfh +#endif // __riscv_vector + #endif // RISCV_USABILITY_H diff --git a/src/layer/riscv/rvv_mathfun.h b/src/layer/riscv/rvv_mathfun.h index 8993b5ad8e6..aa966de6c86 100644 --- a/src/layer/riscv/rvv_mathfun.h +++ b/src/layer/riscv/rvv_mathfun.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT32_LOG_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t log_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ x = 
vfmax_vf_f32m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f32m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT32_LOG_OP(8, 4) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT32_EXP_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t exp_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT32_EXP_OP(8, 4) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT32_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat32m##LMUL##_t x, vfloat32m##LMUL##_t* ysin, vfloat32m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat32m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT32_SINCOS_OP(2, 16) _RVV_FLOAT32_SINCOS_OP(4, 8) _RVV_FLOAT32_SINCOS_OP(8, 4) -#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT32_SIN_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t sin_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT32_SIN_OP(1, 32) @@ -270,12 +270,12 @@ _RVV_FLOAT32_SIN_OP(2, 16) _RVV_FLOAT32_SIN_OP(4, 8) _RVV_FLOAT32_SIN_OP(8, 4) -#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, word_type vl) \ - { \ - vfloat32m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT32_COS_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t cos_ps(vfloat32m##LMUL##_t x, size_t vl) \ + { \ + vfloat32m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT32_COS_OP(1, 32) @@ -293,7 +293,7 @@ _RVV_FLOAT32_COS_OP(8, 4) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT32_TANH_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t tanh_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ vfloat32m##LMUL##_t x2 = vfsgnj_vf_f32m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT32_TANH_OP(2, 16) _RVV_FLOAT32_TANH_OP(4, 8) _RVV_FLOAT32_TANH_OP(8, 4) -#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT32_POW_OP(LMUL, MLEN) \ + static inline vfloat32m##LMUL##_t pow_ps(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f32m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT32_POW_OP(1, 32) @@ -354,7 +354,7 @@ _RVV_FLOAT32_POW_OP(4, 8) _RVV_FLOAT32_POW_OP(8, 4) #define _RVV_FLOAT32_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, word_type vl) \ + static inline vfloat32m##LMUL##_t sigmoid_ps(vfloat32m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f32m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ @@ -447,8 +447,8 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) #define c_erfc_sb7 -2.2440952301e+01f /* 0xc1b38712 */ #define 
_RVV_FLOAT32_FMA_HELPER(LMUL) \ - static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float32_t b, \ - float32_t c, word_type vl) \ + static inline vfloat32m##LMUL##_t vfmadd_vff_f32m##LMUL(vfloat32m##LMUL##_t a, float b, \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vf_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -456,7 +456,7 @@ _RVV_FLOAT32_SIGMOID_OP(8, 4) } \ \ static inline vfloat32m##LMUL##_t vfmadd_vvf_f32m##LMUL(vfloat32m##LMUL##_t a, vfloat32m##LMUL##_t b, \ - float32_t c, word_type vl) \ + float c, size_t vl) \ { \ vfloat32m##LMUL##_t ret = vfmul_vv_f32m##LMUL(a, b, vl); \ ret = vfadd_vf_f32m##LMUL(ret, c, vl); \ @@ -469,7 +469,7 @@ _RVV_FLOAT32_FMA_HELPER(2) _RVV_FLOAT32_FMA_HELPER(1) #define _RVV_FLOAT32_ERFC_OP(LMUL, MLEN) \ - static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, word_type vl) \ + static inline vfloat32m##LMUL##_t erfc_ps(vfloat32m##LMUL##_t x, size_t vl) \ { \ /* Argument for polys */ \ vfloat32m##LMUL##_t absx = vfsgnjx_vv_f32m##LMUL(x, x, vl); \ diff --git a/src/layer/riscv/rvv_mathfun_fp16s.h b/src/layer/riscv/rvv_mathfun_fp16s.h index 129a4f94037..e7f18b961ae 100644 --- a/src/layer/riscv/rvv_mathfun_fp16s.h +++ b/src/layer/riscv/rvv_mathfun_fp16s.h @@ -32,7 +32,7 @@ #define c_cephes_log_q2 0.693359375 #define _RVV_FLOAT16_LOG_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t log_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ x = vfmax_vf_f16m##LMUL(x, 0.f, vl); /* force flush to zero on denormal values */ \ vbool##MLEN##_t invalid_mask = vmfle_vf_f16m##LMUL##_b##MLEN(x, 0.f, vl); \ @@ -118,7 +118,7 @@ _RVV_FLOAT16_LOG_OP(8, 2) #define c_cephes_exp_p5 5.0000001201E-1 #define _RVV_FLOAT16_EXP_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t exp_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t tmp, fx; \ \ @@ -184,7 +184,7 @@ _RVV_FLOAT16_EXP_OP(8, 2) #define c_cephes_FOPI 1.27323954473516 // 4 / M_PI #define _RVV_FLOAT16_SINCOS_OP(LMUL, MLEN) \ - static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, word_type vl) \ + static inline void sincos_ps(vfloat16m##LMUL##_t x, vfloat16m##LMUL##_t* ysin, vfloat16m##LMUL##_t* ycos, size_t vl) \ { \ /* any x */ \ vfloat16m##LMUL##_t xmm1, xmm2, xmm3, y; \ @@ -257,12 +257,12 @@ _RVV_FLOAT16_SINCOS_OP(2, 8) _RVV_FLOAT16_SINCOS_OP(4, 4) _RVV_FLOAT16_SINCOS_OP(8, 2) -#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ysin; \ +#define _RVV_FLOAT16_SIN_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t sin_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + sincos_ps(x, &ysin, &ycos, vl); \ + return ysin; \ } _RVV_FLOAT16_SIN_OP(1, 16) @@ -270,12 +270,12 @@ _RVV_FLOAT16_SIN_OP(2, 8) _RVV_FLOAT16_SIN_OP(4, 4) _RVV_FLOAT16_SIN_OP(8, 2) -#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, word_type vl) \ - { \ - vfloat16m##LMUL##_t ysin, ycos; \ - sincos_ps(x, &ysin, &ycos, vl); \ - return ycos; \ +#define _RVV_FLOAT16_COS_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t cos_ps(vfloat16m##LMUL##_t x, size_t vl) \ + { \ + vfloat16m##LMUL##_t ysin, ycos; \ + sincos_ps(x, 
&ysin, &ycos, vl); \ + return ycos; \ } _RVV_FLOAT16_COS_OP(1, 16) @@ -293,7 +293,7 @@ _RVV_FLOAT16_COS_OP(8, 2) #define c_cephes_tanh_p4 -3.33332819422E-1 #define _RVV_FLOAT16_TANH_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, word_type vl) \ + static inline vfloat16m##LMUL##_t tanh_ps(vfloat16m##LMUL##_t x, size_t vl) \ { \ vfloat16m##LMUL##_t x2 = vfsgnj_vf_f16m##LMUL(x, 1.f, vl); \ \ @@ -341,11 +341,11 @@ _RVV_FLOAT16_TANH_OP(2, 8) _RVV_FLOAT16_TANH_OP(4, 4) _RVV_FLOAT16_TANH_OP(8, 2) -#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, word_type vl) \ - { \ - /* pow(x, m) = exp(m * log(x)) */ \ - return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ +#define _RVV_FLOAT16_POW_OP(LMUL, MLEN) \ + static inline vfloat16m##LMUL##_t pow_ps(vfloat16m##LMUL##_t a, vfloat16m##LMUL##_t b, size_t vl) \ + { \ + /* pow(x, m) = exp(m * log(x)) */ \ + return exp_ps(vfmul_vv_f16m##LMUL(b, log_ps(a, vl), vl), vl); \ } _RVV_FLOAT16_POW_OP(1, 16) @@ -354,7 +354,7 @@ _RVV_FLOAT16_POW_OP(4, 4) _RVV_FLOAT16_POW_OP(8, 2) #define _RVV_FLOAT16_SIGMOID_OP(LMUL, MLEN) \ - static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, word_type vl) \ + static inline vfloat16m##LMUL##_t sigmoid_ps(vfloat16m##LMUL##_t _v, size_t vl) \ { \ _v = vfneg_v_f16m##LMUL(_v, vl); \ _v = exp_ps(_v, vl); \ diff --git a/src/layer/riscv/selu_riscv.cpp b/src/layer/riscv/selu_riscv.cpp index 9a4939c8421..932db355cc2 100644 --- a/src/layer/riscv/selu_riscv.cpp +++ b/src/layer/riscv/selu_riscv.cpp @@ -39,7 +39,7 @@ int SELU_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vbool4_t _lower = vmflt_vf_f32m8_b4(_p, 0.f, vl); vbool4_t _higher = vmnot_m_b4(_lower, vl); diff --git a/src/layer/riscv/sigmoid_riscv.cpp b/src/layer/riscv/sigmoid_riscv.cpp index afd07ea2b38..6c10582c668 100644 --- a/src/layer/riscv/sigmoid_riscv.cpp +++ b/src/layer/riscv/sigmoid_riscv.cpp @@ -64,7 +64,7 @@ int Sigmoid_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = sigmoid_ps(_p, vl); @@ -104,7 +104,7 @@ int Sigmoid_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = sigmoid_ps(_p, vl); @@ -135,7 +135,7 @@ int Sigmoid_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& op int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = sigmoid_ps(_p, vl); diff --git a/src/layer/riscv/softmax_riscv.cpp b/src/layer/riscv/softmax_riscv.cpp index 7a93e5de18d..ca910c3d3c0 100644 --- a/src/layer/riscv/softmax_riscv.cpp +++ b/src/layer/riscv/softmax_riscv.cpp @@ -44,7 +44,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); vfloat32m1_t _max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); @@ -61,7 +61,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const 
Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m1_t _sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); @@ -80,7 +80,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons ptr_vol = ptr; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr_vol, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); @@ -112,7 +112,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -141,7 +141,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); @@ -168,7 +168,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -198,7 +198,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr1, vl); vfloat32m1_t _m = vfmv_s_f_f32m1(vundefined_f32m1(), m, vl); @@ -215,7 +215,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr2, vl); vfloat32m1_t _s = vfmv_s_f_f32m1(vundefined_f32m1(), s, vl); @@ -233,7 +233,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr3, vl); @@ -269,7 +269,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _max = vle32_v_f32m8(max, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -295,7 +295,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _max = vle32_v_f32m8(ptr_max, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -319,7 +319,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sum = vle32_v_f32m8(ptr_sum, vl); @@ -358,7 +358,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); @@ -392,7 +392,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) 
cons while (n) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _maxptr = vle32_v_f32m8(maxptr_vol, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -422,7 +422,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons int n = w * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); vfloat32m8_t _sumptr = vle32_v_f32m8(sumptr_vol, vl); @@ -457,7 +457,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_1 = ptr; while (n1 > 0) { - word_type vl = vsetvl_e32m8(n1); + size_t vl = vsetvl_e32m8(n1); vfloat32m8_t _p = vle32_v_f32m8(ptr_1, vl); vfloat32m1_t _scalar_max = vfmv_s_f_f32m1(vundefined_f32m1(), max, vl); _scalar_max = vfredmax_vs_f32m8_f32m1(_scalar_max, _p, _scalar_max, vl); @@ -473,7 +473,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_2 = ptr; while (n2 > 0) { - word_type vl = vsetvl_e32m8(n2); + size_t vl = vsetvl_e32m8(n2); vfloat32m8_t _p = vle32_v_f32m8(ptr_2, vl); vfloat32m1_t _scalar_sum = vfmv_s_f_f32m1(vundefined_f32m1(), sum, vl); @@ -491,7 +491,7 @@ int Softmax_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) cons float* ptr_3 = ptr; while (n3 > 0) { - word_type vl = vsetvl_e32m8(n3); + size_t vl = vsetvl_e32m8(n3); vfloat32m8_t _p = vle32_v_f32m8(ptr_3, vl); _p = vfdiv_vf_f32m8(_p, sum, vl); diff --git a/src/layer/riscv/swish_riscv.cpp b/src/layer/riscv/swish_riscv.cpp index f12ab157ae9..17493d7db69 100644 --- a/src/layer/riscv/swish_riscv.cpp +++ b/src/layer/riscv/swish_riscv.cpp @@ -64,7 +64,7 @@ int Swish_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -103,7 +103,7 @@ int Swish_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = vfdiv_vv_f32m8(_p, vfadd_vf_f32m8(exp_ps(vfneg_v_f32m8(_p, vl), vl), 1.f, vl), vl); @@ -134,7 +134,7 @@ int Swish_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = vfdiv_vv_f16m8(_p, vfadd_vf_f16m8(exp_ps(vfneg_v_f16m8(_p, vl), vl), 1.f, vl), vl); diff --git a/src/layer/riscv/tanh_riscv.cpp b/src/layer/riscv/tanh_riscv.cpp index b0f0cafe7d7..d47de61dc59 100644 --- a/src/layer/riscv/tanh_riscv.cpp +++ b/src/layer/riscv/tanh_riscv.cpp @@ -64,7 +64,7 @@ int TanH_riscv::forward_inplace(Mat& bottom_top_blob, const Option& opt) const int n = size; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = tanh_ps(_p, vl); @@ -103,7 +103,7 @@ int TanH_riscv::forward_inplace_fp16s(Mat& bottom_top_blob, const Option& opt) c int n = size; while (n > 0) { - word_type vl = vsetvl_e16m4(n); + size_t vl = vsetvl_e16m4(n); vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl); _p = tanh_ps(_p, vl); @@ -134,7 +134,7 @@ int TanH_riscv::forward_inplace_fp16sa(Mat& bottom_top_blob, const Option& opt) 
int n = size; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = tanh_ps(_p, vl); diff --git a/src/layer/riscv/unaryop_riscv.cpp b/src/layer/riscv/unaryop_riscv.cpp index 62c6a52740b..e5eb80151b1 100644 --- a/src/layer/riscv/unaryop_riscv.cpp +++ b/src/layer/riscv/unaryop_riscv.cpp @@ -55,7 +55,7 @@ static int unary_op_inplace(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e32m8(n); + size_t vl = vsetvl_e32m8(n); vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); _p = op(_p, vl); @@ -73,7 +73,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsgnj_vf_f32m8(x, 1.f, vl); } @@ -81,7 +81,7 @@ struct unary_op_abs struct unary_op_neg { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfneg_v_f32m8(x, vl); } @@ -89,7 +89,7 @@ struct unary_op_neg struct unary_op_floor { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmfgt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -99,7 +99,7 @@ struct unary_op_floor struct unary_op_ceil { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vint32m8_t _xi = vfcvt_x_f_v_i32m8(x, vl); vbool4_t _mask = vmflt_vv_f32m8_b4(vfcvt_f_x_v_f32m8(_xi, vl), x, vl); @@ -109,7 +109,7 @@ struct unary_op_ceil struct unary_op_square { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfmul_vv_f32m8(x, x, vl); } @@ -117,7 +117,7 @@ struct unary_op_square struct unary_op_sqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return vfsqrt_v_f32m8(x, vl); } @@ -125,7 +125,7 @@ struct unary_op_sqrt struct unary_op_rsqrt { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrsqrt7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(vfmul_vf_f32m8(x, 0.5f, vl), vfmul_vv_f32m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -136,7 +136,7 @@ struct unary_op_rsqrt struct unary_op_exp { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -144,7 +144,7 @@ struct unary_op_exp struct unary_op_log { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -152,7 +152,7 @@ struct unary_op_log struct unary_op_sin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -160,7 +160,7 @@ struct unary_op_sin struct unary_op_cos { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + 
vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -168,7 +168,7 @@ struct unary_op_cos struct unary_op_tan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -183,7 +183,7 @@ struct unary_op_tan struct unary_op_asin { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -198,7 +198,7 @@ struct unary_op_asin struct unary_op_acos { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -213,7 +213,7 @@ struct unary_op_acos struct unary_op_atan { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector tmp(vl); @@ -228,7 +228,7 @@ struct unary_op_atan struct unary_op_reciprocal { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { vfloat32m8_t _reciprocal = vfrec7_v_f32m8(x, vl); _reciprocal = vfmul_vv_f32m8(vfrsub_vf_f32m8(vfmul_vv_f32m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -239,7 +239,7 @@ struct unary_op_reciprocal struct unary_op_tanh { - vfloat32m8_t operator()(const vfloat32m8_t& x, const word_type& vl) const + vfloat32m8_t operator()(const vfloat32m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } @@ -338,7 +338,7 @@ static int unary_op_inplace_fp16s(Mat& a, const Option& opt) int n = size * elempack; while (n > 0) { - word_type vl = vsetvl_e16m8(n); + size_t vl = vsetvl_e16m8(n); vfloat16m8_t _p = vle16_v_f16m8(ptr, vl); _p = op(_p, vl); @@ -356,7 +356,7 @@ namespace UnaryOp_riscv_functor { struct unary_op_abs_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsgnj_vf_f16m8(x, 1.f, vl); } @@ -364,7 +364,7 @@ struct unary_op_abs_fp16s struct unary_op_neg_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfneg_v_f16m8(x, vl); } @@ -372,7 +372,7 @@ struct unary_op_neg_fp16s struct unary_op_floor_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmfgt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -382,7 +382,7 @@ struct unary_op_floor_fp16s struct unary_op_ceil_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vint16m8_t _xi = vfcvt_x_f_v_i16m8(x, vl); vbool2_t _mask = vmflt_vv_f16m8_b2(vfcvt_f_x_v_f16m8(_xi, vl), x, vl); @@ -392,7 +392,7 @@ struct unary_op_ceil_fp16s struct unary_op_square_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfmul_vv_f16m8(x, x, vl); } @@ -400,7 +400,7 @@ struct unary_op_square_fp16s struct unary_op_sqrt_fp16s { - 
vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return vfsqrt_v_f16m8(x, vl); } @@ -408,7 +408,7 @@ struct unary_op_sqrt_fp16s struct unary_op_rsqrt_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrsqrt7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(vfmul_vf_f16m8(x, 0.5f, vl), vfmul_vv_f16m8(_reciprocal, _reciprocal, vl), vl), 1.5f, vl), _reciprocal, vl); @@ -419,7 +419,7 @@ struct unary_op_rsqrt_fp16s struct unary_op_exp_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return exp_ps(x, vl); } @@ -427,7 +427,7 @@ struct unary_op_exp_fp16s struct unary_op_log_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return log_ps(x, vl); } @@ -435,7 +435,7 @@ struct unary_op_log_fp16s struct unary_op_sin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return sin_ps(x, vl); } @@ -443,7 +443,7 @@ struct unary_op_sin_fp16s struct unary_op_cos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return cos_ps(x, vl); } @@ -451,7 +451,7 @@ struct unary_op_cos_fp16s struct unary_op_tan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -466,7 +466,7 @@ struct unary_op_tan_fp16s struct unary_op_asin_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -481,7 +481,7 @@ struct unary_op_asin_fp16s struct unary_op_acos_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -496,7 +496,7 @@ struct unary_op_acos_fp16s struct unary_op_atan_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { // TODO rvv optimize std::vector<__fp16> tmp(vl); @@ -511,7 +511,7 @@ struct unary_op_atan_fp16s struct unary_op_reciprocal_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { vfloat16m8_t _reciprocal = vfrec7_v_f16m8(x, vl); _reciprocal = vfmul_vv_f16m8(vfrsub_vf_f16m8(vfmul_vv_f16m8(x, _reciprocal, vl), 2.f, vl), _reciprocal, vl); @@ -522,7 +522,7 @@ struct unary_op_reciprocal_fp16s struct unary_op_tanh_fp16s { - vfloat16m8_t operator()(const vfloat16m8_t& x, const word_type& vl) const + vfloat16m8_t operator()(const vfloat16m8_t& x, const size_t& vl) const { return tanh_ps(x, vl); } diff --git a/src/mat.h b/src/mat.h index 6d7deb502a2..e534def504f 100644 --- a/src/mat.h +++ b/src/mat.h @@ -1071,7 +1071,7 @@ NCNN_FORCEINLINE void Mat::fill(v4f32 _v) 
 NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 4;
-    const word_type vl = vsetvl_e32m1(packn);
+    const size_t vl = vsetvl_e32m1(packn);
 
     int size = (int)total();
     float* ptr = (float*)data;
@@ -1085,7 +1085,7 @@ NCNN_FORCEINLINE void Mat::fill(vfloat32m1_t _v)
 NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 2;
-    const word_type vl = vsetvl_e16m1(packn);
+    const size_t vl = vsetvl_e16m1(packn);
 
     int size = (int)total();
     unsigned short* ptr = (unsigned short*)data;
@@ -1099,7 +1099,7 @@ NCNN_FORCEINLINE void Mat::fill(vuint16m1_t _v)
 NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 1;
-    const word_type vl = vsetvl_e8m1(packn);
+    const size_t vl = vsetvl_e8m1(packn);
 
     int size = (int)total();
     signed char* ptr = (signed char*)data;
@@ -1113,7 +1113,7 @@ NCNN_FORCEINLINE void Mat::fill(vint8m1_t _v)
 NCNN_FORCEINLINE void Mat::fill(vfloat16m1_t _v)
 {
     const int packn = cpu_riscv_vlenb() / 2;
-    const word_type vl = vsetvl_e16m1(packn);
+    const size_t vl = vsetvl_e16m1(packn);
 
     int size = (int)total();
     __fp16* ptr = (__fp16*)data;
diff --git a/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake
new file mode 100644
index 00000000000..953f21aaf95
--- /dev/null
+++ b/toolchains/riscv64-unknown-linux-gnu.llvm-toolchain.cmake
@@ -0,0 +1,29 @@
+set(CMAKE_SYSTEM_NAME Generic)
+set(CMAKE_SYSTEM_PROCESSOR riscv64)
+
+if(DEFINED ENV{RISCV_ROOT_PATH})
+    file(TO_CMAKE_PATH $ENV{RISCV_ROOT_PATH} RISCV_ROOT_PATH)
+else()
+    message(FATAL_ERROR "RISCV_ROOT_PATH env must be defined")
+endif()
+
+set(RISCV_ROOT_PATH ${RISCV_ROOT_PATH} CACHE STRING "root path to riscv gnu toolchain")
+
+set(CMAKE_C_COMPILER "clang")
+set(CMAKE_CXX_COMPILER "clang++")
+set(CMAKE_SYSROOT "${RISCV_ROOT_PATH}/sysroot")
+
+set(CMAKE_C_COMPILER_TARGET "riscv64-unknown-linux-gnu")
+set(CMAKE_CXX_COMPILER_TARGET "riscv64-unknown-linux-gnu")
+
+set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+
+# add --ld-path=${RISCV_ROOT_PATH}/bin/riscv64-unknown-linux-gnu-ld or append $RISCV_ROOT_PATH/bin to PATH.
+set(CMAKE_C_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc")
+set(CMAKE_CXX_FLAGS "--gcc-toolchain=${RISCV_ROOT_PATH} -march=rv64gc")
+
+# cache flags
+set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "c flags")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}" CACHE STRING "c++ flags")
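
Note on the word_type -> size_t substitution that runs through the hunks above: recent snapshots of the RISC-V vector intrinsics drop the old word_type alias, and vsetvl_* simply returns size_t, so every strip-mined loop now declares its active vector length as size_t. A minimal sketch of that loop shape, using only intrinsics that already appear in this patch (vsetvl_e32m8, vle32_v_f32m8, vfmax_vf_f32m8) plus the matching vse32_v_f32m8 store; the relu_inplace name is just for illustration:

    #include <riscv_vector.h>

    // Clamp n floats at ptr to be non-negative, consuming up to vlmax elements per pass.
    // vl is whatever vsetvl grants for the remaining element count, so no scalar tail is needed.
    static void relu_inplace(float* ptr, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e32m8(n);              // active length for this pass

            vfloat32m8_t _p = vle32_v_f32m8(ptr, vl); // load up to vl elements
            _p = vfmax_vf_f32m8(_p, 0.f, vl);         // x = max(x, 0)
            vse32_v_f32m8(ptr, _p, vl);               // store them back

            ptr += vl;
            n -= vl;
        }
    }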
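The riscv_usability.h wrappers added earlier in this diff exist because the updated intrinsics replace the multi-output vlseg/vsseg forms with tuple types (vfloat16m1x2_t/x4/x8 and friends), packed with vcreate_* and unpacked with vget_*; the wrappers preserve the old multi-register call shape at existing call sites. A rough usage sketch under the same __riscv_vector + __riscv_zfh configuration; deinterleave4_fp16 is a hypothetical helper, not part of the patch:

    #include <riscv_vector.h>
    #include "riscv_usability.h" // provides the vlseg4e16_v_f16m1 wrapper from this patch

    // Split an interleaved fp16 buffer (x0 y0 z0 w0 x1 y1 z1 w1 ...) into four planar outputs.
    static void deinterleave4_fp16(const float16_t* src, float16_t* x, float16_t* y, float16_t* z, float16_t* w, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e16m1(n);

            vfloat16m1_t _x, _y, _z, _w;
            vlseg4e16_v_f16m1(&_x, &_y, &_z, &_w, src, vl); // segmented load: one field per register

            vse16_v_f16m1(x, _x, vl);
            vse16_v_f16m1(y, _y, vl);
            vse16_v_f16m1(z, _z, vl);
            vse16_v_f16m1(w, _w, vl);

            src += vl * 4;
            x += vl;
            y += vl;
            z += vl;
            w += vl;
            n -= vl;
        }
    }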
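Several of the fp16s paths touched here (sigmoid, swish, tanh) share one pattern: values are stored as fp16 but widened to fp32 before calling the shared *_ps helpers from rvv_mathfun.h, then narrowed back on store. A condensed sketch of that round trip; it assumes the narrowing convert and fp16 store follow the same old-style naming as the widening load used in the patch (vfncvt_f_f_w_f16m4, vse16_v_f16m4), which should be checked against the toolchain's riscv_vector.h:

    #include <riscv_vector.h>
    #include "rvv_mathfun.h" // fp32 sigmoid_ps used by the fp16s layer code

    // fp16 storage, fp32 arithmetic: widen, compute, narrow back.
    static void sigmoid_fp16s(__fp16* ptr, int n)
    {
        while (n > 0)
        {
            size_t vl = vsetvl_e16m4(n);

            // widen e16m4 -> e32m8 so the fp32 sigmoid_ps can be reused unchanged
            vfloat32m8_t _p = vfwcvt_f_f_v_f32m8(vle16_v_f16m4(ptr, vl), vl);
            _p = sigmoid_ps(_p, vl);

            // narrow e32m8 -> e16m4 and store
            vse16_v_f16m4(ptr, vfncvt_f_f_w_f16m4(_p, vl), vl);

            ptr += vl;
            n -= vl;
        }
    }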