From 466a7fb765e56eeda28dee238bb56770013c5ce2 Mon Sep 17 00:00:00 2001 From: ixgbe00 Date: Wed, 22 Jan 2025 09:51:05 +0800 Subject: [PATCH 1/3] SIMD: add rvv optimization for RISC-V --- .github/workflows/linux_qemu.yml | 115 ++- meson.options | 2 +- meson_cpu/meson.build | 2 +- numpy/_core/meson.build | 18 +- numpy/_core/src/common/simd/intdiv.h | 21 +- numpy/_core/src/common/simd/rvv/arithmetic.h | 410 ++++++++ numpy/_core/src/common/simd/rvv/conversion.h | 141 +++ numpy/_core/src/common/simd/rvv/math.h | 594 +++++++++++ numpy/_core/src/common/simd/rvv/memory.h | 969 +++++++++++++++++ numpy/_core/src/common/simd/rvv/misc.h | 753 ++++++++++++++ numpy/_core/src/common/simd/rvv/operators.h | 944 +++++++++++++++++ numpy/_core/src/common/simd/rvv/reorder.h | 970 ++++++++++++++++++ numpy/_core/src/common/simd/rvv/rvv.h | 86 ++ numpy/_core/src/common/simd/simd.h | 4 + .../src/umath/loops_arithmetic.dispatch.c.src | 4 +- .../umath/loops_hyperbolic.dispatch.cpp.src | 28 + numpy/_core/tests/test_cpu_dispatcher.py | 2 +- numpy/_core/tests/test_cpu_features.py | 9 + 18 files changed, 5056 insertions(+), 16 deletions(-) create mode 100644 numpy/_core/src/common/simd/rvv/arithmetic.h create mode 100644 numpy/_core/src/common/simd/rvv/conversion.h create mode 100644 numpy/_core/src/common/simd/rvv/math.h create mode 100644 numpy/_core/src/common/simd/rvv/memory.h create mode 100644 numpy/_core/src/common/simd/rvv/misc.h create mode 100644 numpy/_core/src/common/simd/rvv/operators.h create mode 100644 numpy/_core/src/common/simd/rvv/reorder.h create mode 100644 numpy/_core/src/common/simd/rvv/rvv.h diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index 1293e9c37c2f..0f9c81c55d2b 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -73,14 +73,6 @@ jobs: "(test_kind or test_multiarray or test_simd or test_umath or test_ufunc) and not test_gcd_overflow", "s390x" ] - - [ - "riscv64", - "riscv64-linux-gnu", - "riscv64/ubuntu:22.04", - "-Dallow-noblas=true", - "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", - "riscv64" - ] env: TOOLCHAIN_NAME: ${{ matrix.BUILD_PROP[1] }} DOCKER_CONTAINER: ${{ matrix.BUILD_PROP[2] }} @@ -170,7 +162,7 @@ jobs: '" - linux_loongarch64_qemu: + linux_loongarch64_riscv64_qemu: # Only workflow_dispatch is enabled on forks. 
# To enable this job and subsequent jobs on a fork for other events, comment out: if: github.repository == 'numpy/numpy' || github.event_name == 'workflow_dispatch' @@ -267,3 +259,108 @@ jobs: /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' cd /numpy && spin test -- -k \"${RUNTIME_TEST_FILTER}\" '" + + + linux_riscv64_qemu: + # To enable this workflow on a fork, comment out: + if: github.repository == 'numpy/numpy' + runs-on: ubuntu-24.04 + continue-on-error: true + strategy: + fail-fast: false + matrix: + BUILD_PROP: + - [ + "riscv64", + "riscv64-linux-gnu", + "riscv64/ubuntu:24.04", + "-Dallow-noblas=true", + "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", + "riscv64" + ] + env: + TOOLCHAIN_NAME: ${{ matrix.BUILD_PROP[1] }} + DOCKER_CONTAINER: ${{ matrix.BUILD_PROP[2] }} + MESON_OPTIONS: ${{ matrix.BUILD_PROP[3] }} + RUNTIME_TEST_FILTER: ${{ matrix.BUILD_PROP[4] }} + ARCH: ${{ matrix.BUILD_PROP[5] }} + TERM: xterm-256color + + name: "${{ matrix.BUILD_PROP[0] }}" + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: recursive + fetch-tags: true + persist-credentials: false + + - name: Initialize binfmt_misc for qemu-user-static + run: | + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + + - name: Install GCC cross-compilers + run: | + sudo apt update + sudo apt install -y ninja-build gcc-14-${TOOLCHAIN_NAME} g++-14-${TOOLCHAIN_NAME} gfortran-14-${TOOLCHAIN_NAME} + + - name: Cache docker container + uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 + id: container-cache + with: + path: ~/docker_${{ matrix.BUILD_PROP[1] }} + key: container-${{ runner.os }}-${{ matrix.BUILD_PROP[1] }}-${{ matrix.BUILD_PROP[2] }}-${{ hashFiles('requirements/build_requirements.txt') }} + + - name: Creates new container + if: steps.container-cache.outputs.cache-hit != 'true' + run: | + docker run --platform=linux/${ARCH} --name the_container --interactive \ + -v /:/host -v $(pwd):/numpy ${DOCKER_CONTAINER} /bin/bash -c " + apt update && + apt install -y cmake git python3 python-is-python3 python3-dev python3-pip && + mkdir -p /lib64 && ln -s /host/lib64/ld-* /lib64/ && + ln -s /host/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu && + rm -rf /usr/${TOOLCHAIN_NAME} && ln -s /host/usr/${TOOLCHAIN_NAME} /usr/${TOOLCHAIN_NAME} && + rm -rf /usr/lib/gcc/${TOOLCHAIN_NAME} && ln -s /host/usr/lib/gcc-cross/${TOOLCHAIN_NAME} /usr/lib/gcc/${TOOLCHAIN_NAME} && + rm -f /usr/bin/gcc && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gcc-14 /usr/bin/gcc && + rm -f /usr/bin/g++ && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-g++-14 /usr/bin/g++ && + rm -f /usr/bin/gfortran && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gfortran-14 /usr/bin/gfortran && + rm -f /usr/bin/ar && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ar /usr/bin/ar && + rm -f /usr/bin/as && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-as /usr/bin/as && + rm -f /usr/bin/ld && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld /usr/bin/ld && + rm -f /usr/bin/ld.bfd && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld.bfd /usr/bin/ld.bfd && + rm -f /usr/bin/ninja && ln -s /host/usr/bin/ninja /usr/bin/ninja && + git config --global --add safe.directory /numpy && + # No need to build ninja from source, the host ninja is used for the build + grep -v ninja /numpy/requirements/build_requirements.txt > /tmp/build_requirements.txt && + python -m pip install --break-system-packages -r /tmp/build_requirements.txt && + python -m pip install --break-system-packages pytest pytest-xdist 
hypothesis typing_extensions pytest-timeout && + rm -f /usr/local/bin/ninja && mkdir -p /usr/local/bin && ln -s /host/usr/bin/ninja /usr/local/bin/ninja + " + docker commit the_container the_container + mkdir -p "~/docker_${TOOLCHAIN_NAME}" + docker save -o "~/docker_${TOOLCHAIN_NAME}/the_container.tar" the_container + + - name: Load container from cache + if: steps.container-cache.outputs.cache-hit == 'true' + run: docker load -i "~/docker_${TOOLCHAIN_NAME}/the_container.tar" + + - name: Meson Build + run: | + docker run --rm --platform=linux/${ARCH} -e "TERM=xterm-256color" \ + -v $(pwd):/numpy -v /:/host the_container \ + /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' + cd /numpy && spin build --clean -- ${MESON_OPTIONS} + '" + + - name: Meson Log + if: always() + run: 'cat build/meson-logs/meson-log.txt' + + - name: Run Tests + run: | + docker run --rm --platform=linux/${ARCH} -e "TERM=xterm-256color" \ + -v $(pwd):/numpy -v /:/host the_container \ + /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' + export F90=/usr/bin/gfortran + cd /numpy && spin test -- --timeout=600 --durations=10 -k \"${RUNTIME_TEST_FILTER}\" + '" diff --git a/meson.options b/meson.options index 1be05d324756..7b86854e9882 100644 --- a/meson.options +++ b/meson.options @@ -35,7 +35,7 @@ option('test-simd', type: 'array', 'VSX', 'VSX2', 'VSX3', 'VSX4', 'NEON', 'ASIMD', 'VX', 'VXE', 'VXE2', - 'LSX', + 'LSX','RVV', ], description: 'Specify a list of CPU features to be tested against NumPy SIMD interface') option('test-simd-args', type: 'string', value: '', diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build index e5b6d0fbe7be..5c5cc018a036 100644 --- a/meson_cpu/meson.build +++ b/meson_cpu/meson.build @@ -97,7 +97,7 @@ min_features = { 's390x': [], 'arm': [], 'aarch64': [ASIMD], - 'riscv64': [], + 'riscv64': [RVV], 'wasm32': [], 'loongarch64': [LSX], }.get(cpu_family, []) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index d1c78910b2a3..7b74abcfa578 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -103,6 +103,10 @@ if host_machine.cpu_family() == 'loongarch64' add_project_arguments(['-DHWY_COMPILE_ONLY_SCALAR'], language: ['cpp']) endif +if host_machine.cpu_family() == 'riscv64' + add_project_arguments('-march=rv64gcv_zvl128b', '-mrvv-vector-bits=zvl', language: ['c','cpp']) +endif + use_highway = not get_option('disable-highway') if use_highway and not fs.exists('src/highway/README.md') error('Missing the `highway` git submodule! 
Run `git submodule update --init` to fix this.') @@ -750,6 +754,7 @@ _umath_tests_mtargets = mod_features.multi_targets( ASIMDHP, ASIMD, NEON, VSX3, VSX2, VSX, VXE, VX, + RVV, ], baseline: CPU_BASELINE, prefix: 'NPY_', @@ -794,7 +799,8 @@ foreach gen_mtargets : [ AVX512_SKX, AVX2, XOP, SSE42, SSE2, VSX2, ASIMD, NEON, - VXE, VX + VXE, VX, + RVV, ] ], ] @@ -897,6 +903,7 @@ foreach gen_mtargets : [ VSX3, VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -908,6 +915,7 @@ foreach gen_mtargets : [ VSX4, VSX2, VX, LSX, + RVV, ] ], [ @@ -919,6 +927,7 @@ foreach gen_mtargets : [ NEON, VXE, VX, LSX, + RVV, ] ], [ @@ -960,6 +969,7 @@ foreach gen_mtargets : [ VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -978,6 +988,7 @@ foreach gen_mtargets : [ NEON_VFPV4, VXE2, VXE, LSX, + RVV, ] ], [ @@ -994,6 +1005,7 @@ foreach gen_mtargets : [ VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -1005,6 +1017,7 @@ foreach gen_mtargets : [ ASIMD, NEON, VXE, VX, LSX, + RVV, ] ], [ @@ -1015,6 +1028,7 @@ foreach gen_mtargets : [ VSX2, ASIMD, NEON, LSX, + RVV, ] ], [ @@ -1026,6 +1040,7 @@ foreach gen_mtargets : [ VSX3, VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -1037,6 +1052,7 @@ foreach gen_mtargets : [ VSX2, VX, LSX, + RVV, ] ], ] diff --git a/numpy/_core/src/common/simd/intdiv.h b/numpy/_core/src/common/simd/intdiv.h index 0284d49d23bb..1e84159e2e04 100644 --- a/numpy/_core/src/common/simd/intdiv.h +++ b/numpy/_core/src/common/simd/intdiv.h @@ -220,6 +220,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_setall_u8(sh1); divisor.val[2] = npyv_setall_u8(sh2); +#elif defined(NPY_HAVE_RVV) + divisor.val[0] = npyv_setall_u8(m); + divisor.val[1] = npyv_setall_u8(sh1); + divisor.val[2] = npyv_setall_u8(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -253,7 +257,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) npyv_s8x3 divisor; divisor.val[0] = npyv_setall_s8(m); divisor.val[2] = npyv_setall_s8(d < 0 ? 
-1 : 0); - #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_LSX) + #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_LSX) || defined(NPY_HAVE_RVV) divisor.val[1] = npyv_setall_s8(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s8(-sh); @@ -298,6 +302,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_u16(sh1); divisor.val[2] = npyv_setall_u16(sh2); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_u16(sh1); + divisor.val[2] = npyv_setall_u16(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -330,6 +337,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) divisor.val[1] = npyv_setall_s16(-sh); #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s16(sh); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_s16(sh); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -370,6 +379,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_u32(sh1); divisor.val[2] = npyv_setall_u32(sh2); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_u32(sh1); + divisor.val[2] = npyv_setall_u32(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -407,6 +419,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) divisor.val[1] = npyv_setall_s32(-sh); #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s32(sh); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_s32(sh); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -444,6 +458,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_u64(sh1); divisor.val[2] = npyv_setall_u64(sh2); + #elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_u64(sh1); + divisor.val[2] = npyv_setall_u64(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -484,6 +501,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) divisor.val[1] = npyv_set_s64(sh); #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s64(sh); + #elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_s64(sh); #else #error "please initialize the shifting operand for the new architecture" #endif diff --git a/numpy/_core/src/common/simd/rvv/arithmetic.h b/numpy/_core/src/common/simd/rvv/arithmetic.h new file mode 100644 index 000000000000..a301bc7b258e --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/arithmetic.h @@ -0,0 +1,410 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_ARITHMETIC_H +#define _NPY_SIMD_RVV_ARITHMETIC_H + +#include + +/*************************** + * Addition + ***************************/ +// non-saturated +NPY_FINLINE npyv_u8 npyv_add_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vadd_vv_u8m1(a, b, 16); } +NPY_FINLINE npyv_s8 npyv_add_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vadd_vv_i8m1(a, b, 16); } +NPY_FINLINE npyv_u16 npyv_add_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vadd_vv_u16m1(a, b, 8); } +NPY_FINLINE npyv_s16 npyv_add_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vadd_vv_i16m1(a, b, 8); } +NPY_FINLINE npyv_u32 npyv_add_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vadd_vv_u32m1(a, b, 4); } +NPY_FINLINE npyv_s32 npyv_add_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vadd_vv_i32m1(a, b, 4); } +NPY_FINLINE npyv_u64 npyv_add_u64(npyv_u64 
a, npyv_u64 b) { return __riscv_vadd_vv_u64m1(a, b, 2); } +NPY_FINLINE npyv_s64 npyv_add_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vadd_vv_i64m1(a, b, 2); } +NPY_FINLINE npyv_f32 npyv_add_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfadd_vv_f32m1(a, b, 4); } +NPY_FINLINE npyv_f64 npyv_add_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfadd_vv_f64m1(a, b, 2); } + +// saturated +NPY_FINLINE npyv_u8 npyv_adds_u8(npyv_u8 a, npyv_u8 b) { + return __riscv_vsaddu_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_adds_s8(npyv_s8 a, npyv_s8 b) { + return __riscv_vsadd_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_adds_u16(npyv_u16 a, npyv_u16 b) { + return __riscv_vsaddu_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_s16 npyv_adds_s16(npyv_s16 a, npyv_s16 b) { + return __riscv_vsadd_vv_i16m1(a, b, 8); +} + +/*************************** + * Subtraction + ***************************/ +// non-saturated +NPY_FINLINE npyv_u8 npyv_sub_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vsub_vv_u8m1(a, b, 16); } +NPY_FINLINE npyv_s8 npyv_sub_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vsub_vv_i8m1(a, b, 16); } +NPY_FINLINE npyv_u16 npyv_sub_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vsub_vv_u16m1(a, b, 8); } +NPY_FINLINE npyv_s16 npyv_sub_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vsub_vv_i16m1(a, b, 8); } +NPY_FINLINE npyv_u32 npyv_sub_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vsub_vv_u32m1(a, b, 4); } +NPY_FINLINE npyv_s32 npyv_sub_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vsub_vv_i32m1(a, b, 4); } +NPY_FINLINE npyv_u64 npyv_sub_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vsub_vv_u64m1(a, b, 2); } +NPY_FINLINE npyv_s64 npyv_sub_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vsub_vv_i64m1(a, b, 2); } +NPY_FINLINE npyv_f32 npyv_sub_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfsub_vv_f32m1(a, b, 4); } +NPY_FINLINE npyv_f64 npyv_sub_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfsub_vv_f64m1(a, b, 2); } + +// saturated +NPY_FINLINE npyv_u8 npyv_subs_u8(npyv_u8 a, npyv_u8 b) { + return __riscv_vssubu_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_subs_s8(npyv_s8 a, npyv_s8 b) { + return __riscv_vssub_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_subs_u16(npyv_u16 a, npyv_u16 b) { + return __riscv_vssubu_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_s16 npyv_subs_s16(npyv_s16 a, npyv_s16 b) { + return __riscv_vssub_vv_i16m1(a, b, 8); +} + +/*************************** + * Multiplication + ***************************/ +// non-saturated +NPY_FINLINE npyv_u8 npyv_mul_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vmul_vv_u8m1(a, b, 16); } +NPY_FINLINE npyv_s8 npyv_mul_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vmul_vv_i8m1(a, b, 16); } +NPY_FINLINE npyv_u16 npyv_mul_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vmul_vv_u16m1(a, b, 8); } +NPY_FINLINE npyv_s16 npyv_mul_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vmul_vv_i16m1(a, b, 8); } +NPY_FINLINE npyv_u32 npyv_mul_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vmul_vv_u32m1(a, b, 4); } +NPY_FINLINE npyv_s32 npyv_mul_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vmul_vv_i32m1(a, b, 4); } +NPY_FINLINE npyv_f32 npyv_mul_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfmul_vv_f32m1(a, b, 4); } +NPY_FINLINE npyv_f64 npyv_mul_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfmul_vv_f64m1(a, b, 2); } + +/*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_s16 vmull_high_s8(npyv_s8 a, npyv_s8 b) { + vint8mf2_t a_high = 
__riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(a, 8, 16)); + vint8mf2_t b_high = __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b, 8, 16)); + return __riscv_vwmul_vv_i16m1(a_high, b_high, 8); +} + +NPY_FINLINE npyv_s32 vmull_high_s16(npyv_s16 a, npyv_s16 b) { + vint16mf2_t a_high = __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(a, 4, 8)); + vint16mf2_t b_high = __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b, 4, 8)); + return __riscv_vwmul_vv_i32m1(a_high, b_high, 4); +} + +NPY_FINLINE npyv_s64 vmull_high_s32(npyv_s32 a, npyv_s32 b) { + vint32mf2_t a_high = __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(a, 2, 4)); + vint32mf2_t b_high = __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b, 2, 4)); + return __riscv_vwmul_vv_i64m1(a_high, b_high, 2); +} + +NPY_FINLINE npyv_u16 vmull_high_u8(npyv_u8 a, npyv_u8 b) { + vuint8mf2_t a_high = __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(a, 8, 16)); + vuint8mf2_t b_high = __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b, 8, 16)); + return __riscv_vwmulu_vv_u16m1(a_high, b_high, 8); +} + +NPY_FINLINE npyv_u32 vmull_high_u16(npyv_u16 a, npyv_u16 b) { + vuint16mf2_t a_high = __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(a, 4, 8)); + vuint16mf2_t b_high = __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b, 4, 8)); + return __riscv_vwmulu_vv_u32m1(a_high, b_high, 4); +} + +NPY_FINLINE npyv_u64 vmull_high_u32(npyv_u32 a, npyv_u32 b) { + vuint32mf2_t a_high = __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(a, 2, 4)); + vuint32mf2_t b_high = __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b, 2, 4)); + return __riscv_vwmulu_vv_u64m1(a_high, b_high, 2); +} + +NPY_FINLINE npyv_u8 vshlq_u8(npyv_u8 a, npyv_s8 b) { + // implementation only works within defined range 'b' in [0, 7] + vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(b, 0, 16); + npyv_u8 shl = __riscv_vsll_vv_u8m1(a, __riscv_vreinterpret_v_i8m1_u8m1(b), 16); + vuint16m2_t a_ext = __riscv_vzext_vf2_u16m2(a, 16); + npyv_s8 b_neg = __riscv_vneg_v_i8m1(b, 16); + npyv_u8 shr = __riscv_vnclipu_wv_u8m1(a_ext, __riscv_vreinterpret_v_i8m1_u8m1(b_neg), __RISCV_VXRM_RDN, 16); + return __riscv_vmerge_vvm_u8m1(shr, shl, positive_mask, 16); +} + +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + const npyv_u8 mulc_lo = divisor.val[0]; + // high part of unsigned multiplication + npyv_u16 mull_lo = __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vwmulu_vv_u16m2(a, mulc_lo, 8)); + npyv_u16 mull_hi = vmull_high_u8(a, divisor.val[0]); + // get the high unsigned bytes + npyv_u8 mulhi = vuzp2q_u8(__riscv_vreinterpret_v_u16m1_u8m1(mull_lo), __riscv_vreinterpret_v_u16m1_u8m1(mull_hi)); + + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u8 q = __riscv_vsub_vv_u8m1(a, mulhi, 16); + q = vshlq_u8(q, __riscv_vreinterpret_v_u8m1_i8m1(divisor.val[1])); + q = __riscv_vadd_vv_u8m1(mulhi, q, 16); + q = vshlq_u8(q, __riscv_vreinterpret_v_u8m1_i8m1(divisor.val[2])); + return q; +} + +NPY_FINLINE npyv_s8 vshlq_s8(npyv_s8 a, npyv_s8 b) { + // implementation only works within defined range 'b' in [0, 7] + vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(b, 0, 16); + npyv_s8 shl = __riscv_vsll_vv_i8m1(a, __riscv_vreinterpret_v_i8m1_u8m1(b), 16); + vint16m2_t a_ext = __riscv_vsext_vf2_i16m2(a, 16); + npyv_s8 b_neg = __riscv_vneg_v_i8m1(b, 16); + npyv_s8 shr = __riscv_vnclip_wv_i8m1(a_ext, __riscv_vreinterpret_v_i8m1_u8m1(b_neg), 
__RISCV_VXRM_RDN, 16); + return __riscv_vmerge_vvm_i8m1(shr, shl, positive_mask, 16); +} + +NPY_FINLINE npyv_s8 vshrq_n_s8(npyv_s8 a, const int b) { + const int imm = b - (b >> 3); + return __riscv_vsra_vx_i8m1(a, imm, 16); +} + +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + const npyv_s8 mulc_lo = divisor.val[0]; + // high part of signed multiplication + npyv_s16 mull_lo = __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vwmul_vv_i16m2(a, mulc_lo, 8)); + npyv_s16 mull_hi = vmull_high_s8(a, divisor.val[0]); + // get the high unsigned bytes + npyv_s8 mulhi = vuzp2q_s8(__riscv_vreinterpret_v_i16m1_i8m1(mull_lo), __riscv_vreinterpret_v_i16m1_i8m1(mull_hi)); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s8 q = vshlq_s8(__riscv_vadd_vv_i8m1(a, mulhi, 16), divisor.val[1]); + q = __riscv_vsub_vv_i8m1(q, vshrq_n_s8(a, 7), 16); + q = __riscv_vsub_vv_i8m1(__riscv_vxor_vv_i8m1(q, divisor.val[2], 16), divisor.val[2], 16); + return q; +} + +NPY_FINLINE npyv_u16 vshlq_u16(npyv_u16 a, npyv_s16 b) { + // implementation only works within defined range 'b' in [0, 15] + vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); + npyv_u16 shl = __riscv_vsll_vv_u16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); + vuint32m2_t a_ext = __riscv_vzext_vf2_u32m2(a, 8); + npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); + npyv_u16 shr = __riscv_vnclipu_wv_u16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); + return __riscv_vmerge_vvm_u16m1(shr, shl, positive_mask, 8); +} + +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + const npyv_u16 mulc_lo = divisor.val[0]; + // high part of unsigned multiplication + npyv_u32 mull_lo = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vwmulu_vv_u32m2(a, mulc_lo, 4)); + npyv_u32 mull_hi = vmull_high_u16(a, divisor.val[0]); + // get the high unsigned bytes + npyv_u16 mulhi = vuzp2q_u16(__riscv_vreinterpret_v_u32m1_u16m1(mull_lo), __riscv_vreinterpret_v_u32m1_u16m1(mull_hi)); + + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u16 q = __riscv_vsub_vv_u16m1(a, mulhi, 8); + q = vshlq_u16(q, __riscv_vreinterpret_v_u16m1_i16m1(divisor.val[1])); + q = __riscv_vadd_vv_u16m1(mulhi, q, 8); + q = vshlq_u16(q, __riscv_vreinterpret_v_u16m1_i16m1(divisor.val[2])); + return q; +} + +NPY_FINLINE npyv_s16 vshrq_n_s16(npyv_s16 a, const int b) { + const int imm = b - (b >> 4); + return __riscv_vsra_vx_i16m1(a, imm, 8); +} + +NPY_FINLINE npyv_s16 vshlq_s16(npyv_s16 a, npyv_s16 b) { + // implementation only works within defined range 'b' in [0, 15] + vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); + npyv_s16 shl = __riscv_vsll_vv_i16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); + vint32m2_t a_ext = __riscv_vsext_vf2_i32m2(a, 8); + npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); + npyv_s16 shr = __riscv_vnclip_wv_i16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); + return __riscv_vmerge_vvm_i16m1(shr, shl, positive_mask, 8); +} + +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + const npyv_s16 mulc_lo = divisor.val[0]; + // high part of signed multiplication + npyv_s32 mull_lo = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vwmul_vv_i32m2(a, mulc_lo, 4)); + npyv_s32 mull_hi = 
vmull_high_s16(a, divisor.val[0]); + // get the high unsigned bytes + npyv_s16 mulhi = vuzp2q_s16(__riscv_vreinterpret_v_i32m1_i16m1(mull_lo), __riscv_vreinterpret_v_i32m1_i16m1(mull_hi)); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s16 q = vshlq_s16(__riscv_vadd_vv_i16m1(a, mulhi, 8), divisor.val[1]); + q = __riscv_vsub_vv_i16m1(q, vshrq_n_s16(a, 15), 8); + q = __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(q, divisor.val[2], 8), divisor.val[2], 8); + return q; +} + +NPY_FINLINE npyv_u32 vshlq_u32(npyv_u32 a, npyv_s32 b) { + // implementation only works within defined range 'b' in [0, 31] + vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); + npyv_u32 shl = __riscv_vsll_vv_u32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); + vuint64m2_t a_ext = __riscv_vzext_vf2_u64m2(a, 4); + npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); + npyv_u32 shr = __riscv_vnclipu_wv_u32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); + return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 4); +} + +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + const npyv_u32 mulc_lo = divisor.val[0]; + // high part of unsigned multiplication + npyv_u64 mull_lo = __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vwmulu_vv_u64m2(a, mulc_lo, 2)); + npyv_u64 mull_hi = vmull_high_u32(a, divisor.val[0]); + // get the high unsigned bytes + npyv_u32 mulhi = vuzp2q_u32(__riscv_vreinterpret_v_u64m1_u32m1(mull_lo), __riscv_vreinterpret_v_u64m1_u32m1(mull_hi)); + + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + npyv_u32 q = __riscv_vsub_vv_u32m1(a, mulhi, 4); + q = vshlq_u32(q, __riscv_vreinterpret_v_u32m1_i32m1(divisor.val[1])); + q = __riscv_vadd_vv_u32m1(mulhi, q, 4); + q = vshlq_u32(q, __riscv_vreinterpret_v_u32m1_i32m1(divisor.val[2])); + return q; +} + +NPY_FINLINE npyv_s32 vshrq_n_s32(npyv_s32 a, const int b) { + const int imm = b - (b >> 5); + return __riscv_vsra_vx_i32m1(a, imm, 4); +} + +NPY_FINLINE npyv_s32 vshlq_s32(npyv_s32 a, npyv_s32 b) { + // implementation only works within defined range 'b' in [0, 31] + vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); + npyv_s32 shl = __riscv_vsll_vv_i32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); + vint64m2_t a_ext = __riscv_vsext_vf2_i64m2(a, 4); + npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); + npyv_s32 shr = __riscv_vnclip_wv_i32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); + return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 4); +} + +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + const npyv_s32 mulc_lo = divisor.val[0]; + // high part of signed multiplication + npyv_s64 mull_lo = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vwmul_vv_i64m2(a, mulc_lo, 2)); + npyv_s64 mull_hi = vmull_high_s32(a, divisor.val[0]); + // get the high unsigned bytes + npyv_s32 mulhi = vuzp2q_s32(__riscv_vreinterpret_v_i64m1_i32m1(mull_lo), __riscv_vreinterpret_v_i64m1_i32m1(mull_hi)); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + npyv_s32 q = vshlq_s32(__riscv_vadd_vv_i32m1(a, mulhi, 4), divisor.val[1]); + q = __riscv_vsub_vv_i32m1(q, vshrq_n_s32(a, 31), 4); + q = __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(q, divisor.val[2], 4), divisor.val[2], 4); + return q; +} + +NPY_FINLINE uint64_t vgetq_lane_u64(npyv_u64 a, const int b) { + return __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(a, b, 2)); +} 
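/*
 * Editorial sketch, not part of this patch: how the npyv_divc_* helpers above
 * are intended to be paired with the npyv_divisor_* precomputation from
 * simd/intdiv.h.  The multiplier and shift counts are expanded once outside
 * the loop, and every vector of numerators reuses them.  The function name is
 * hypothetical; `len` is assumed to be a multiple of npyv_nlanes_u8, `d` must
 * be non-zero, and tail handling is omitted for brevity.
 */
static inline void
divc_by_scalar_u8_sketch(npy_uint8 *dst, const npy_uint8 *src, npy_intp len, npy_uint8 d)
{
    const npyv_u8x3 div = npyv_divisor_u8(d);            // multiplier + shifts, computed once
    for (npy_intp i = 0; i < len; i += npyv_nlanes_u8) {
        npyv_u8 a = npyv_load_u8(src + i);                // contiguous load (rvv/memory.h)
        npyv_store_u8(dst + i, npyv_divc_u8(a, div));     // floor(a / d) for every lane
    }
}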
+ +NPY_FINLINE int64_t vgetq_lane_s64(npyv_s64 a, const int b) { + return __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(a, b, 2)); +} + +// divide each unsigned 64-bit element by a divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + const uint64_t d = vgetq_lane_u64(divisor.val[0], 0); + return npyv_set_u64(vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d); +} + +// returns the high 64 bits of signed 64-bit multiplication +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + const int64_t d = vgetq_lane_s64(divisor.val[0], 0); + return npyv_set_s64(vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d); +} + +/*************************** + * Division + ***************************/ +NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfdiv_vv_f32m1(a, b, 4); } +NPY_FINLINE npyv_f64 npyv_div_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfdiv_vv_f64m1(a, b, 2); } + +/*************************** + * FUSED F32 + ***************************/ +// multiply and add, a*b + c +NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfmacc_vv_f32m1(c, a, b, 4); } + +// multiply and subtract, a*b - c +NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfmacc_vv_f32m1(__riscv_vfneg_v_f32m1(c, 4), a, b, 4); } + +// negate multiply and add, -(a*b) + c +NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfnmsac_vv_f32m1(c, a, b, 4); } + +// negate multiply and subtract, -(a*b) - c +NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfnmsac_vv_f32m1(__riscv_vfneg_v_f32m1(c, 4), a, b, 4); } + +// multiply, add for odd elements and subtract even elements. 
+// (a * b) -+ c +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ + const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); + return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); +} + +/*************************** + * FUSED F64 + ***************************/ +NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfmacc_vv_f64m1(c, a, b, 2); } + +NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfmacc_vv_f64m1(__riscv_vfneg_v_f64m1(c, 2), a, b, 2); } + +NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfnmsac_vv_f64m1(c, a, b, 2); } + +NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfnmsac_vv_f64m1(__riscv_vfneg_v_f64m1(c, 2), a, b, 2); } + +NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ + const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); + return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); +} + +/*************************** + * Summation + ***************************/ +// reduce sum across vector +NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) { + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredsum_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 4)); +} + +NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a) { + return __riscv_vmv_x_s_u64m1_u64(__riscv_vredsum_vs_u64m1_u64m1(a, __riscv_vmv_v_x_u64m1(0, 2), 2)); +} + +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { + return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredosum_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(0, 4), 4)); +} + +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) { + return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredosum_vs_f64m1_f64m1(a, __riscv_vfmv_v_f_f64m1(0, 2), 2)); +} + +NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) { + return __riscv_vmv_x_s_u16m1_u16(__riscv_vwredsumu_vs_u8m1_u16m1(a, __riscv_vmv_v_x_u16m1(0, 8), 16)); +} + +NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) { + return __riscv_vmv_x_s_u32m1_u32(__riscv_vwredsumu_vs_u16m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 8)); +} + +#endif // _NPY_SIMD_RVV_ARITHMETIC_H \ No newline at end of file diff --git a/numpy/_core/src/common/simd/rvv/conversion.h b/numpy/_core/src/common/simd/rvv/conversion.h new file mode 100644 index 000000000000..1e937920c4e7 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/conversion.h @@ -0,0 +1,141 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_CVT_H +#define _NPY_SIMD_RVV_CVT_H + +#define npyv_cvt_u8_b8(A) A +#define npyv_cvt_s8_b8(A) __riscv_vreinterpret_v_u8m1_i8m1(A) +#define npyv_cvt_u16_b16(A) A +#define npyv_cvt_s16_b16(A) __riscv_vreinterpret_v_u16m1_i16m1(A) +#define npyv_cvt_u32_b32(A) A +#define npyv_cvt_s32_b32(A) __riscv_vreinterpret_v_u32m1_i32m1(A) +#define npyv_cvt_u64_b64(A) A +#define npyv_cvt_s64_b64(A) __riscv_vreinterpret_v_u64m1_i64m1(A) +#define npyv_cvt_f32_b32(A) __riscv_vreinterpret_v_u32m1_f32m1(A) +#define npyv_cvt_f64_b64(A) __riscv_vreinterpret_v_u64m1_f64m1(A) + +#define npyv_cvt_b8_u8(A) A +#define npyv_cvt_b8_s8(A) __riscv_vreinterpret_v_i8m1_u8m1(A) +#define npyv_cvt_b16_u16(A) A +#define npyv_cvt_b16_s16(A) __riscv_vreinterpret_v_i16m1_u16m1(A) +#define npyv_cvt_b32_u32(A) A +#define npyv_cvt_b32_s32(A) __riscv_vreinterpret_v_i32m1_u32m1(A) +#define npyv_cvt_b64_u64(A) A +#define npyv_cvt_b64_s64(A) __riscv_vreinterpret_v_i64m1_u64m1(A) +#define npyv_cvt_b32_f32(A) __riscv_vreinterpret_v_f32m1_u32m1(A) +#define npyv_cvt_b64_f64(A) 
__riscv_vreinterpret_v_f64m1_u64m1(A) + +NPY_FINLINE npyv_u8 vqtbl1q_u8(npyv_u8 t, npyv_u8 idx) { + vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8(idx, 16, 16); + return __riscv_vmerge_vxm_u8m1(__riscv_vrgather_vv_u8m1(t, idx, 16), 0, mask, 16); +} + +NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) +{ + const npyv_u8 scale = npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); + npyv_u8 seq_scale = __riscv_vand_vv_u8m1(a, scale, 16); + const npyv_u8 byteOrder = npyv_set_u8(0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15); + npyv_u8 v0 = vqtbl1q_u8(seq_scale, byteOrder); + return __riscv_vmv_x_s_u32m1_u32(__riscv_vwredsumu_vs_u16m1_u32m1(__riscv_vreinterpret_v_u8m1_u16m1(v0), __riscv_vmv_v_x_u32m1(0, 4), 8)); +} + +NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) +{ + const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128); + npyv_u16 seq_scale = __riscv_vand_vv_u16m1(a, scale, 8); + return __riscv_vmv_x_s_u16m1_u16(__riscv_vredsum_vs_u16m1_u16m1(seq_scale, __riscv_vmv_v_x_u16m1(0, 8), 8)); +} + +NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) +{ + const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8); + npyv_u32 seq_scale = __riscv_vand_vv_u32m1(a, scale, 4); + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredsum_vs_u32m1_u32m1(seq_scale, __riscv_vmv_v_x_u32m1(0, 4), 4)); +} + +NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + uint64_t first = __riscv_vmv_x_s_u64m1_u64(a); + uint64_t second = __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(a, 1, vlen)); + return ((second & 0x2) | (first & 0x1)); +} + +//expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) { + npyv_u16x2 r; + r.val[0] = __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(data, 8)); + r.val[1] = __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(__riscv_vslidedown_vx_u8m1(data, 8, 16), 8)); + return r; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { + npyv_u32x2 r; + r.val[0] = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(data, 4)); + r.val[1] = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(__riscv_vslidedown_vx_u16m1(data, 4, 8), 4)); + return r; +} + +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { + npyv_b8 a8 = __riscv_vreinterpret_v_u16m1_u8m1(a); + npyv_b8 b8 = __riscv_vreinterpret_v_u16m1_u8m1(b); + return vuzp1q_u8(a8, b8); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { + npyv_b16 a16 = __riscv_vreinterpret_v_u32m1_u16m1(a); + npyv_b16 b16 = __riscv_vreinterpret_v_u32m1_u16m1(b); + npyv_b16 c16 = __riscv_vreinterpret_v_u32m1_u16m1(c); + npyv_b16 d16 = __riscv_vreinterpret_v_u32m1_u16m1(d); + + npyv_b16 ab = vuzp1q_u16(a16, b16); + npyv_b16 cd = vuzp1q_u16(c16, d16); + + return npyv_pack_b8_b16(ab, cd); +} + + // pack eight 64-bit boolean vectors into one 8-bit boolean vector + NPY_FINLINE npyv_b8 + npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { + npyv_b32 a32 = __riscv_vreinterpret_v_u64m1_u32m1(a); + npyv_b32 b32 = __riscv_vreinterpret_v_u64m1_u32m1(b); + npyv_b32 c32 = __riscv_vreinterpret_v_u64m1_u32m1(c); + npyv_b32 d32 = __riscv_vreinterpret_v_u64m1_u32m1(d); + npyv_b32 e32 = __riscv_vreinterpret_v_u64m1_u32m1(e); + npyv_b32 f32 = __riscv_vreinterpret_v_u64m1_u32m1(f); + npyv_b32 g32 = __riscv_vreinterpret_v_u64m1_u32m1(g); + 
npyv_b32 h32 = __riscv_vreinterpret_v_u64m1_u32m1(h); + + npyv_b32 ab = vuzp1q_u32(a32, b32); + npyv_b32 cd = vuzp1q_u32(c32, d32); + npyv_b32 ef = vuzp1q_u32(e32, f32); + npyv_u32 gh = vuzp1q_u32(g32, h32); + + return npyv_pack_b8_b32(ab, cd, ef, gh); + } + +// round to nearest integer +NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + // (round-to-nearest-even) + return __riscv_vfcvt_x_f_v_i32m1(a, vlen); +} + +NPY_FINLINE npyv_s32 vmovn_s64(npyv_s64 a) { + return __riscv_vnsra_wx_i32m1(__riscv_vlmul_ext_v_i64m1_i64m2(a), 0, 2); +} + +NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) +{ + npyv_s64 lo = __riscv_vfcvt_x_f_v_i64m1(a, 2), hi = __riscv_vfcvt_x_f_v_i64m1(b, 2); + return __riscv_vslideup_vx_i32m1(vmovn_s64(lo), vmovn_s64(hi), 2, 4); +} + +#endif // _NPY_SIMD_RVV_CVT_H diff --git a/numpy/_core/src/common/simd/rvv/math.h b/numpy/_core/src/common/simd/rvv/math.h new file mode 100644 index 000000000000..9b3b34a200dc --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/math.h @@ -0,0 +1,594 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_MATH_H +#define _NPY_SIMD_RVV_MATH_H + +#include + +/*************************** + * Elementary + ***************************/ +NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfabs_v_f32m1(a, vlen); +} + +NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfabs_v_f64m1(a, vlen); +} + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfmul_vv_f32m1(a, a, vlen); +} + +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfmul_vv_f64m1(a, a, vlen); +} + +// Square root +NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfsqrt_v_f32m1(a, vlen); +} + +NPY_FINLINE npyv_f64 npyv_sqrt_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfsqrt_v_f64m1(a, vlen); +} + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + vfloat32m1_t one = __riscv_vfmv_v_f_f32m1(1.0f, vlen); + return __riscv_vfdiv_vv_f32m1(one, a, vlen); +} + +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + npyv_f64 one = __riscv_vfmv_v_f_f64m1(1.0, vlen); + return __riscv_vfdiv_vv_f64m1(one, a, vlen); +} + +// Maximum +NPY_FINLINE npyv_f32 npyv_max_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfmax_vv_f32m1(a, b, vlen); +} + +NPY_FINLINE npyv_f64 npyv_max_f64(npyv_f64 a, npyv_f64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfmax_vv_f64m1(a, b, vlen); +} + +// Maximum +NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + + vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); + vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(b, b, vlen); + + vfloat32m1_t sel_a = __riscv_vmerge_vvm_f32m1(b, a, not_nan_a, vlen); + vfloat32m1_t sel_b = __riscv_vmerge_vvm_f32m1(a, b, not_nan_b, vlen); + + return __riscv_vfmax_vv_f32m1(sel_a, sel_b, vlen); +} + + +// Max, propagates NaNs +NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfmax_vv_f32m1(a, b, vlen); +} + +// 
Max, NaN-propagating +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfmax_vv_f64m1(a, b, vlen); +} + +// Max, NaN-suppressing +NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + + vbool64_t a_is_nan = __riscv_vmfne_vv_f64m1_b64(a, a, vlen); + vbool64_t b_is_nan = __riscv_vmfne_vv_f64m1_b64(b, b, vlen); + + vfloat64m1_t max = __riscv_vfmax_vv_f64m1(a, b, vlen); + max = __riscv_vmerge_vvm_f64m1(max, b, a_is_nan, vlen); + max = __riscv_vmerge_vvm_f64m1(max, a, b_is_nan, vlen); + + return max; +} + +// Maximum, integer operations +NPY_FINLINE npyv_u8 npyv_max_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vmaxu_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_max_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vmax_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_max_u16(npyv_u16 a, npyv_u16 b) +{ + size_t vlen = __riscv_vsetvlmax_e16m1(); + return __riscv_vmaxu_vv_u16m1(a, b, vlen); +} + +NPY_FINLINE npyv_s16 npyv_max_s16(npyv_s16 a, npyv_s16 b) +{ + size_t vlen = __riscv_vsetvlmax_e16m1(); + return __riscv_vmax_vv_i16m1(a, b, vlen); +} + +NPY_FINLINE npyv_u32 npyv_max_u32(npyv_u32 a, npyv_u32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vmaxu_vv_u32m1(a, b, vlen); +} + +NPY_FINLINE npyv_s32 npyv_max_s32(npyv_s32 a, npyv_s32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vmax_vv_i32m1(a, b, vlen); +} + +NPY_FINLINE npyv_u64 npyv_max_u64(npyv_u64 a, npyv_u64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + vbool64_t mask = __riscv_vmsgtu_vv_u64m1_b64(a, b, vlen); + return __riscv_vmerge_vvm_u64m1(b, a, mask, vlen); +} + +NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + vbool64_t mask = __riscv_vmsgt_vv_i64m1_b64(a, b, vlen); + return __riscv_vmerge_vvm_i64m1(b, a, mask, vlen); +} + +// Minimum, natively mapping with no guarantees to handle NaN. +NPY_FINLINE npyv_f32 npyv_min_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfmin_vv_f32m1(a, b, vlen); +} + +NPY_FINLINE npyv_f64 npyv_min_f64(npyv_f64 a, npyv_f64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfmin_vv_f64m1(a, b, vlen); +} + +// Minimum +NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + + vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); + vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(b, b, vlen); + + vfloat32m1_t sel_a = __riscv_vmerge_vvm_f32m1(b, a, not_nan_a, vlen); + vfloat32m1_t sel_b = __riscv_vmerge_vvm_f32m1(a, b, not_nan_b, vlen); + + return __riscv_vfmin_vv_f32m1(sel_a, sel_b, vlen); +} + +// Min, propagates NaNs +// If any of corresponded element is NaN, NaN is set. 
+NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfmin_vv_f32m1(a, b, vlen); +} + +// Min, NaN-propagating +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfmin_vv_f64m1(a, b, vlen); +} + +// Min, NaN-suppressing +NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + + vbool64_t a_is_nan = __riscv_vmfne_vv_f64m1_b64(a, a, vlen); + vbool64_t b_is_nan = __riscv_vmfne_vv_f64m1_b64(b, b, vlen); + + npyv_f64 min = __riscv_vfmin_vv_f64m1(a, b, vlen); + min = __riscv_vmerge_vvm_f64m1(min, b, a_is_nan, vlen); + min = __riscv_vmerge_vvm_f64m1(min, a, b_is_nan, vlen); + + return min; +} + +// Minimum, integer operations +NPY_FINLINE npyv_u8 npyv_min_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vminu_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_min_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vmin_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_min_u16(npyv_u16 a, npyv_u16 b) +{ + size_t vlen = __riscv_vsetvlmax_e16m1(); + return __riscv_vminu_vv_u16m1(a, b, vlen); +} + +NPY_FINLINE npyv_s16 npyv_min_s16(npyv_s16 a, npyv_s16 b) +{ + size_t vlen = __riscv_vsetvlmax_e16m1(); + return __riscv_vmin_vv_i16m1(a, b, vlen); +} + +NPY_FINLINE npyv_u32 npyv_min_u32(npyv_u32 a, npyv_u32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vminu_vv_u32m1(a, b, vlen); +} + +NPY_FINLINE npyv_s32 npyv_min_s32(npyv_s32 a, npyv_s32 b) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vmin_vv_i32m1(a, b, vlen); +} + +NPY_FINLINE npyv_u64 npyv_min_u64(npyv_u64 a, npyv_u64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + vbool64_t mask = __riscv_vmsltu_vv_u64m1_b64(a, b, vlen); + return __riscv_vmerge_vvm_u64m1(b, a, mask, vlen); +} + +NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + vbool64_t mask = __riscv_vmslt_vv_i64m1_b64(a, b, vlen); + return __riscv_vmerge_vvm_i64m1(b, a, mask, vlen); +} + +// reduce min/max for all data types +// Maximum reductions +NPY_FINLINE uint8_t npyv_reduce_max_u8(npyv_u8 a) +{ + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(0, 16), 16)); +} + +NPY_FINLINE int8_t npyv_reduce_max_s8(npyv_s8 a) +{ + return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmax_vs_i8m1_i8m1(a, __riscv_vmv_v_x_i8m1(INT8_MIN, 16), 16)); +} + +NPY_FINLINE uint16_t npyv_reduce_max_u16(npyv_u16 a) +{ + return __riscv_vmv_x_s_u16m1_u16(__riscv_vredmaxu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(0, 8), 8)); +} + +NPY_FINLINE int16_t npyv_reduce_max_s16(npyv_s16 a) +{ + return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmax_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(INT16_MIN, 8), 8)); +} + +NPY_FINLINE uint32_t npyv_reduce_max_u32(npyv_u32 a) +{ + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 4)); +} + +NPY_FINLINE int32_t npyv_reduce_max_s32(npyv_s32 a) +{ + return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmax_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(INT32_MIN, 4), 4)); +} + +// Floating-point maximum reductions +NPY_FINLINE float npyv_reduce_max_f32(npyv_f32 a) +{ + uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b32_u8m1(__riscv_vmfeq_vv_f32m1_b32(a, a, 4))); + if ((mask & 0b1111) != 0b1111) { + return NAN; + } + return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(-FLT_MAX, 4), 4)); +} + +NPY_FINLINE double 
npyv_reduce_max_f64(npyv_f64 a) +{ + uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b64_u8m1(__riscv_vmfeq_vv_f64m1_b64(a, a, 2))); + if ((mask & 0b11) != 0b11) { + return NAN; + } + return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmax_vs_f64m1_f64m1(a, __riscv_vfmv_v_f_f64m1(-DBL_MAX, 2), 2)); +} + +// NaN-propagating maximum reductions +#define npyv_reduce_maxn_f32 npyv_reduce_max_f32 +#define npyv_reduce_maxn_f64 npyv_reduce_max_f64 + +// NaN-suppressing maximum reductions +NPY_FINLINE float npyv_reduce_maxp_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + + vbool32_t valid_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); + + vbool32_t nan_mask = __riscv_vmnot_m_b32(valid_mask, vlen); + + npyv_f32 masked_a = __riscv_vfmerge_vfm_f32m1( + a, + -INFINITY, + nan_mask, + vlen + ); + + return __riscv_vfmv_f_s_f32m1_f32( + __riscv_vfredmax_vs_f32m1_f32m1( + masked_a, + __riscv_vfmv_v_f_f32m1(-INFINITY, vlen), + vlen + ) + ); +} + +NPY_FINLINE double npyv_reduce_maxp_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + + vbool64_t valid_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, vlen); + + vbool64_t nan_mask = __riscv_vmnot_m_b64(valid_mask, vlen); + + npyv_f64 masked_a = __riscv_vfmerge_vfm_f64m1( + a, + -INFINITY, + nan_mask, + vlen + ); + + return __riscv_vfmv_f_s_f64m1_f64( + __riscv_vfredmax_vs_f64m1_f64m1( + masked_a, + __riscv_vfmv_v_f_f64m1(-INFINITY, vlen), + vlen + ) + ); +} + +// Minimum reductions +NPY_FINLINE uint8_t npyv_reduce_min_u8(npyv_u8 a) +{ + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), 16)); +} + +NPY_FINLINE int8_t npyv_reduce_min_s8(npyv_s8 a) +{ + return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmin_vs_i8m1_i8m1(a, __riscv_vmv_v_x_i8m1(INT8_MAX, 16), 16)); +} + +NPY_FINLINE uint16_t npyv_reduce_min_u16(npyv_u16 a) +{ + return __riscv_vmv_x_s_u16m1_u16(__riscv_vredminu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), 8)); +} + +NPY_FINLINE int16_t npyv_reduce_min_s16(npyv_s16 a) +{ + return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmin_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(INT16_MAX, 8), 8)); +} + +NPY_FINLINE uint32_t npyv_reduce_min_u32(npyv_u32 a) +{ + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)); +} + +NPY_FINLINE int32_t npyv_reduce_min_s32(npyv_s32 a) +{ + return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmin_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(INT32_MAX, 4), 4)); +} + +// Floating-point minimum reductions +NPY_FINLINE float npyv_reduce_min_f32(npyv_f32 a) +{ + uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b32_u8m1(__riscv_vmfeq_vv_f32m1_b32(a, a, 4))); + if ((mask & 0b1111) != 0b1111) { + return NAN; + } + return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(FLT_MAX, 4), 4)); +} + +NPY_FINLINE double npyv_reduce_min_f64(npyv_f64 a) +{ + uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b64_u8m1(__riscv_vmfeq_vv_f64m1_b64(a, a, 2))); + if ((mask & 0b11) != 0b11) { + return NAN; + } + return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmin_vs_f64m1_f64m1(a, __riscv_vfmv_v_f_f64m1(DBL_MAX, 2), 2)); +} + +// NaN-propagating minimum reductions +#define npyv_reduce_minn_f32 npyv_reduce_min_f32 +#define npyv_reduce_minn_f64 npyv_reduce_min_f64 + +// NaN-suppressing minimum reductions +NPY_FINLINE float npyv_reduce_minp_f32(npyv_f32 a) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + + vbool32_t valid_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); + + vbool32_t 
nan_mask = __riscv_vmnot_m_b32(valid_mask, vlen); + + npyv_f32 masked_a = __riscv_vfmerge_vfm_f32m1( + a, + INFINITY, + nan_mask, + vlen + ); + + return __riscv_vfmv_f_s_f32m1_f32( + __riscv_vfredmin_vs_f32m1_f32m1( + masked_a, + __riscv_vfmv_v_f_f32m1(INFINITY, vlen), + vlen + ) + ); +} + +NPY_FINLINE double npyv_reduce_minp_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + + vbool64_t valid_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, vlen); + + vbool64_t nan_mask = __riscv_vmnot_m_b64(valid_mask, vlen); + + npyv_f64 masked_a = __riscv_vfmerge_vfm_f64m1( + a, + INFINITY, + nan_mask, + vlen + ); + + return __riscv_vfmv_f_s_f64m1_f64( + __riscv_vfredmin_vs_f64m1_f64m1( + masked_a, + __riscv_vfmv_v_f_f64m1(INFINITY, vlen), + vlen + ) + ); +} + +// Maximum reductions for 64-bit integers +NPY_FINLINE npy_uint64 npyv_reduce_max_u64(npyv_u64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vmv_x_s_u64m1_u64( + __riscv_vredmax_vs_u64m1_u64m1( + a, + __riscv_vmv_v_x_u64m1(0, vlen), + vlen + ) + ); +} + +NPY_FINLINE npy_int64 npyv_reduce_max_s64(npyv_s64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vredmax_vs_i64m1_i64m1( + a, + __riscv_vmv_v_x_i64m1(INT64_MIN, vlen), + vlen + ) + ); +} + +NPY_FINLINE npy_uint64 npyv_reduce_min_u64(npyv_u64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vmv_x_s_u64m1_u64( + __riscv_vredmin_vs_u64m1_u64m1( + a, + __riscv_vmv_v_x_u64m1(UINT64_MAX, vlen), + vlen + ) + ); +} + +NPY_FINLINE npy_int64 npyv_reduce_min_s64(npyv_s64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vmv_x_s_i64m1_i64( + __riscv_vredmin_vs_i64m1_i64m1( + a, + __riscv_vmv_v_x_i64m1(INT64_MAX, vlen), + vlen + ) + ); +} + +// round to nearest integer even +NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) +{ + return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RNE, 4), 4); +} + +NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfcvt_f_x_v_f64m1( + __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RNE, vlen), + vlen + ); +} + +// ceil +NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) +{ + return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RUP, 4), 4); +} + +NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfcvt_f_x_v_f64m1( + __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RUP, vlen), + vlen + ); +} + +// trunc +NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) +{ + return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RTZ, 4), 4); +} + +NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfcvt_f_x_v_f64m1( + __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RTZ, vlen), + vlen + ); +} + +// floor +NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) +{ + return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RDN, 4), 4); +} + +NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) +{ + size_t vl = __riscv_vsetvlmax_e64m1(); + return __riscv_vfcvt_f_x_v_f64m1( + __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RDN, vl), + vl + ); +} + +#endif // _NPY_SIMD_RVV_MATH_H diff --git a/numpy/_core/src/common/simd/rvv/memory.h b/numpy/_core/src/common/simd/rvv/memory.h new file mode 100644 index 000000000000..5c847b115aad --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/memory.h @@ -0,0 +1,969 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + 
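/*
 * Editorial note, not part of this patch: every wrapper in these RVV headers
 * passes a hard-coded application vector length (16, 8, 4, 2 or 1 elements)
 * instead of querying vsetvl per call.  That is only valid because the build
 * pins VLEN to 128 bits via `-march=rv64gcv_zvl128b -mrvv-vector-bits=zvl`
 * (see the numpy/_core/meson.build hunk above), so a u8 vector always holds
 * 16 lanes, u16 holds 8, u32/f32 hold 4, and u64/f64 hold 2.
 */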
+#ifndef _NPY_SIMD_RVV_MEMORY_H +#define _NPY_SIMD_RVV_MEMORY_H + +#include "misc.h" + +/*************************** + * load/store + ***************************/ +// GCC requires literal type definitions for pointers types otherwise it causes ambiguous errors +// uint8_t +NPY_FINLINE npyv_u8 npyv_load_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, 16); } + +NPY_FINLINE npyv_u8 npyv_loada_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, 16); } + +NPY_FINLINE npyv_u8 npyv_loads_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, 16); } + +NPY_FINLINE npyv_u8 npyv_loadl_u8(const npyv_lanetype_u8 *ptr) +{ + return __riscv_vslideup_vx_u8m1(__riscv_vle8_v_u8m1((const uint8_t*)ptr, 8), __riscv_vmv_v_x_u8m1(0, 8), 8, 16); +} + +NPY_FINLINE void npyv_store_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 16); } + +NPY_FINLINE void npyv_storea_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 16); } + +NPY_FINLINE void npyv_stores_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 16); } + +NPY_FINLINE void npyv_storel_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_storeh_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, __riscv_vslidedown_vx_u8m1(vec, 8, 16), 8); } + + +// int8_t +NPY_FINLINE npyv_s8 npyv_load_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, 16); } + +NPY_FINLINE npyv_s8 npyv_loada_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, 16); } + +NPY_FINLINE npyv_s8 npyv_loads_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, 16); } + +NPY_FINLINE npyv_s8 npyv_loadl_s8(const npyv_lanetype_s8 *ptr) +{ + return __riscv_vslideup_vx_i8m1(__riscv_vle8_v_i8m1((const int8_t*)ptr, 8), __riscv_vmv_v_x_i8m1(0, 8), 8, 16); +} + +NPY_FINLINE void npyv_store_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 16); } + +NPY_FINLINE void npyv_storea_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 16); } + +NPY_FINLINE void npyv_stores_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 16); } + +NPY_FINLINE void npyv_storel_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_storeh_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, __riscv_vslidedown_vx_i8m1(vec, 8, 16), 8); } + +// uint16_t +NPY_FINLINE npyv_u16 npyv_load_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, 8); } + +NPY_FINLINE npyv_u16 npyv_loada_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, 8); } + +NPY_FINLINE npyv_u16 npyv_loads_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, 8); } + +NPY_FINLINE npyv_u16 npyv_loadl_u16(const npyv_lanetype_u16 *ptr) +{ + return __riscv_vslideup_vx_u16m1( + __riscv_vle16_v_u16m1((const uint16_t*)ptr, 4), __riscv_vmv_v_x_u16m1(0, 4), 4, 8 + ); +} + +NPY_FINLINE void npyv_store_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_storea_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 8); } + +NPY_FINLINE void 
npyv_stores_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_storel_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storeh_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, __riscv_vslidedown_vx_u16m1(vec, 4, 8), 4); } + +// int16_t +NPY_FINLINE npyv_s16 npyv_load_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, 8); } + +NPY_FINLINE npyv_s16 npyv_loada_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, 8); } + +NPY_FINLINE npyv_s16 npyv_loads_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, 8); } + +NPY_FINLINE npyv_s16 npyv_loadl_s16(const npyv_lanetype_s16 *ptr) +{ + return __riscv_vslideup_vx_i16m1( + __riscv_vle16_v_i16m1((const int16_t*)ptr, 4), __riscv_vmv_v_x_i16m1(0, 4), 4, 8 + ); +} + +NPY_FINLINE void npyv_store_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_storea_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_stores_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 8); } + +NPY_FINLINE void npyv_storel_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storeh_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, __riscv_vslidedown_vx_i16m1(vec, 4, 8), 4); } + +// uint32_t +NPY_FINLINE npyv_u32 npyv_load_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, 4); } + +NPY_FINLINE npyv_u32 npyv_loada_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, 4); } + +NPY_FINLINE npyv_u32 npyv_loads_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, 4); } + +NPY_FINLINE npyv_u32 npyv_loadl_u32(const npyv_lanetype_u32 *ptr) +{ + return __riscv_vslideup_vx_u32m1( + __riscv_vle32_v_u32m1((const uint32_t*)ptr, 2), __riscv_vmv_v_x_u32m1(0, 2), 2, 4 + ); +} + +NPY_FINLINE void npyv_store_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storea_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_stores_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storel_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storeh_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, __riscv_vslidedown_vx_u32m1(vec, 2, 4), 2); } + +// int32_t +NPY_FINLINE npyv_s32 npyv_load_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, 4); } + +NPY_FINLINE npyv_s32 npyv_loada_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, 4); } + +NPY_FINLINE npyv_s32 npyv_loads_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, 4); } + +NPY_FINLINE npyv_s32 npyv_loadl_s32(const npyv_lanetype_s32 *ptr) +{ + return __riscv_vslideup_vx_i32m1( + __riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vmv_v_x_i32m1(0, 2), 2, 4 + ); +} + +NPY_FINLINE void 
npyv_store_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storea_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_stores_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storel_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storeh_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, __riscv_vslidedown_vx_i32m1(vec, 2, 4), 2); } + +// uint64_t +NPY_FINLINE npyv_u64 npyv_load_u64(const npyv_lanetype_u64 *ptr) +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, 2); } + +NPY_FINLINE npyv_u64 npyv_loada_u64(const npyv_lanetype_u64 *ptr) +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, 2); } + +NPY_FINLINE npyv_u64 npyv_loads_u64(const npyv_lanetype_u64 *ptr) +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, 2); } + +NPY_FINLINE npyv_u64 npyv_loadl_u64(const npyv_lanetype_u64 *ptr) +{ + return __riscv_vslideup_vx_u64m1( + __riscv_vle64_v_u64m1((const uint64_t*)ptr, 1), __riscv_vmv_v_x_u64m1(0, 1), 1, 2 + ); +} + +NPY_FINLINE void npyv_store_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storea_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_stores_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storel_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 1); } + +NPY_FINLINE void npyv_storeh_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, __riscv_vslidedown_vx_u64m1(vec, 1, 2), 1); } + +// int64_t +NPY_FINLINE npyv_s64 npyv_load_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, 2); } + +NPY_FINLINE npyv_s64 npyv_loada_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, 2); } + +NPY_FINLINE npyv_s64 npyv_loads_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, 2); } + +NPY_FINLINE npyv_s64 npyv_loadl_s64(const npyv_lanetype_s64 *ptr) +{ + return __riscv_vslideup_vx_i64m1( + __riscv_vle64_v_i64m1((const int64_t*)ptr, 1), __riscv_vmv_v_x_i64m1(0, 1), 1, 2 + ); +} + +NPY_FINLINE void npyv_store_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storea_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_stores_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storel_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 1); } + +NPY_FINLINE void npyv_storeh_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, __riscv_vslidedown_vx_i64m1(vec, 1, 2), 1); } + +// float +NPY_FINLINE npyv_f32 npyv_load_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1((const float*)ptr, 4); } + +NPY_FINLINE npyv_f32 npyv_loada_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1((const float*)ptr, 4); } + +NPY_FINLINE npyv_f32 npyv_loads_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1((const float*)ptr, 4); } + 
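+// The npyv_loadl_*/npyv_storel_*/npyv_storeh_* variants below operate on half
+// vectors: loadl fills the low lanes from memory and zeroes the upper lanes
+// (via vslideup of a zero vector), while storel/storeh write only the low/high
+// lanes. Hedged sketch, assuming a hypothetical 4-element float buffer `p`:
+//
+//     npyv_f32 v  = npyv_load_f32(p);    // lanes {p[0], p[1], p[2], p[3]}
+//     npyv_f32 lo = npyv_loadl_f32(p);   // lanes {p[0], p[1], 0, 0}
+//     npyv_storeh_f32(p + 2, v);         // writes lanes 2..3 of v to p[2], p[3]
+//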
+NPY_FINLINE npyv_f32 npyv_loadl_f32(const npyv_lanetype_f32 *ptr) +{ + return __riscv_vslideup_vx_f32m1( + __riscv_vle32_v_f32m1((const float*)ptr, 2), __riscv_vfmv_v_f_f32m1(0, 2), 2, 4 + ); +} + +NPY_FINLINE void npyv_store_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storea_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, 4); } + +NPY_FINLINE void npyv_stores_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, 4); } + +NPY_FINLINE void npyv_storel_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storeh_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, __riscv_vslidedown_vx_f32m1(vec, 2, 4), 2); } + + +// double +NPY_FINLINE npyv_f64 npyv_load_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1((const double*)ptr, 2); } + +NPY_FINLINE npyv_f64 npyv_loada_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1((const double*)ptr, 2); } + +NPY_FINLINE npyv_f64 npyv_loads_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1((const double*)ptr, 2); } + +NPY_FINLINE npyv_f64 npyv_loadl_f64(const npyv_lanetype_f64 *ptr) +{ + return __riscv_vslideup_vx_f64m1( + __riscv_vle64_v_f64m1((const double*)ptr, 1), __riscv_vfmv_v_f_f64m1(0, 1), 1, 2 + ); +} + +NPY_FINLINE void npyv_store_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storea_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, 2); } + +NPY_FINLINE void npyv_stores_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, 2); } + +NPY_FINLINE void npyv_storel_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, 1); } + +NPY_FINLINE void npyv_storeh_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, __riscv_vslidedown_vx_f64m1(vec, 1, 2), 1); } + + +/*************************** + * Non-contiguous Load + ***************************/ +NPY_FINLINE npyv_s32 vld1q_lane_s32(const int32_t *a, npyv_s32 b, const int lane) { + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1((uint8_t)(1 << lane), 8)); + npyv_s32 a_dup = __riscv_vmv_v_x_i32m1(a[0], 4); + return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 4); +} + +NPY_FINLINE void vst1q_lane_s32(int32_t *a, npyv_s32 b, const int lane) { + npyv_s32 b_s = __riscv_vslidedown_vx_i32m1(b, lane, 4); + *a = __riscv_vmv_x_s_i32m1_i32(b_s); +} + +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ + npyv_s32 a = __riscv_vmv_v_x_i32m1(0, 4); + a = vld1q_lane_s32((const int32_t*)ptr, a, 0); + a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1); + a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2); + a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3); + return a; +} + +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return npyv_reinterpret_u32_s32( + npyv_loadn_s32((const npy_int32*)ptr, stride) + ); +} +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ + return npyv_reinterpret_f32_s32( + npyv_loadn_s32((const npy_int32*)ptr, stride) + ); +} + +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ + return __riscv_vslideup_vx_i64m1( + __riscv_vle64_v_i64m1((const int64_t*)ptr, 1), __riscv_vle64_v_i64m1((const 
int64_t*)ptr + stride, 1), 1, 2 + ); +} +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ + return npyv_reinterpret_u64_s64( + npyv_loadn_s64((const npy_int64*)ptr, stride) + ); +} + +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ + return npyv_reinterpret_f64_s64( + npyv_loadn_s64((const npy_int64*)ptr, stride) + ); +} + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + return __riscv_vslideup_vx_u32m1( + __riscv_vle32_v_u32m1((const uint32_t*)ptr, 2), __riscv_vle32_v_u32m1((const uint32_t*)ptr + stride, 2), 2, 4 + ); +} + +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_u64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ (void)stride; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ (void)stride; return npyv_load_f64(ptr); } + +/*************************** + * Non-contiguous Store + ***************************/ +NPY_FINLINE npyv_s32 vld1_lane_s32(const int32_t *a, npyv_s32 b, const int lane) { + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1((uint8_t)(1 << lane), 8)); + npyv_s32 a_dup = __riscv_vmv_v_x_i32m1(a[0], 2); + return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 2); +} + +NPY_FINLINE npyv_u32 vld1_lane_u32(const uint32_t *a, npyv_u32 b, const int lane) { + vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1((uint8_t)(1 << lane), 8)); + npyv_u32 a_dup = __riscv_vmv_v_x_u32m1(a[0], 2); + return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 2); +} + +NPY_FINLINE void vst1q_lane_u64(uint64_t *a, npyv_u64 b, const int c) { + npyv_u64 b_s = __riscv_vslidedown_vx_u64m1(b, c, 1); + *a = __riscv_vmv_x_s_u64m1_u64(b_s); +} + +NPY_FINLINE void vst1q_lane_s64(int64_t *a, npyv_s64 b, const int c) { + npyv_s64 b_s = __riscv_vslidedown_vx_i64m1(b, c, 1); + *a = __riscv_vmv_x_s_i64m1_i64(b_s); +} + +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ + vst1q_lane_s32((int32_t*)ptr, a, 0); + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); +} + +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_u32(a)); } + +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_f32(a)); } + +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ + vst1q_lane_s64((int64_t*)ptr, a, 0); + vst1q_lane_s64((int64_t*)ptr + stride, a, 1); +} + +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_u64(a)); } + +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); } + + +//// 64-bit store over 32-bit stride +NPY_FINLINE void 
npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0); + vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); } + +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ (void)stride; npyv_store_u64(ptr, a); } + +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ (void)stride; npyv_store_s64(ptr, a); } + +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ (void)stride; npyv_store_f64(ptr, a); } + +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + npyv_s32 a; + switch(nlane) { + case 1: + a = vld1q_lane_s32((const int32_t*)ptr, __riscv_vmv_v_x_i32m1(fill, 4), 0); + break; + case 2: + a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vmv_v_x_i32m1(fill, 2), 2, 4); + break; + case 3: + a = __riscv_vslideup_vx_i32m1( + __riscv_vle32_v_i32m1((const int32_t*)ptr, 2), + vld1_lane_s32((const int32_t*)ptr + 2, __riscv_vmv_v_x_i32m1(fill, 2), 0), 2, 4 + ); + break; + default: + return npyv_load_s32(ptr); + } +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = __riscv_vor_vv_i32m1(workaround, a, 4); +#endif + return a; +} + +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_till_s32(ptr, nlane, 0); } + +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + npyv_s64 a = __riscv_vslideup_vx_i64m1(__riscv_vle64_v_i64m1((const int64_t*)ptr, 1), __riscv_vmv_v_x_i64m1(fill, 1), 1, 2); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s64 workaround = a; + a = __riscv_vor_vv_i64m1(workaround, a, 2); + #endif + return a; + } + return npyv_load_s64(ptr); +} + +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ return npyv_load_till_s64(ptr, nlane, 0); } + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + npyv_s32 a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vle32_v_i32m1(fill, 2), 2, 4); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = __riscv_vor_vv_i32m1(workaround, a, 4); + #endif + return a; + } + return npyv_load_s32(ptr); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return __riscv_vreinterpret_v_i64m1_i32m1(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE 
npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ (void)nlane; return npyv_load_s64(ptr); } + +/********************************* + * Non-contiguous partial load + *********************************/ +NPY_FINLINE npyv_s32 +npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ + assert(nlane > 0); + npyv_s32 vfill = __riscv_vmv_v_x_i32m1(fill, 4); + switch(nlane) { + case 3: + vfill = vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2); + case 2: + vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1); + case 1: + vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0); + break; + default: + return npyv_loadn_s32(ptr, stride); + } +#if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = vfill; + vfill = __riscv_vor_vv_i32m1(workaround, vfill, 4); +#endif + return vfill; +} + +NPY_FINLINE npyv_s32 npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); } + +NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ + assert(nlane > 0); + if (nlane == 1) { + return npyv_load_till_s64(ptr, 1, fill); + } + return npyv_loadn_s64(ptr, stride); +} + +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + assert(nlane > 0); + if (nlane == 1) { + const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; + npyv_s32 a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vle32_v_i32m1(fill, 2), 2, 4); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = __riscv_vor_vv_i32m1(workaround, a, 4); + #endif + return a; + } + return npyv_loadn2_s32(ptr, stride); +} + +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ + assert(nlane > 0); + if (nlane == 1) { + npyv_s32 a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vmv_v_x_i32m1(0, 2), 2, 4); + #if NPY_SIMD_GUARD_PARTIAL_LOAD + volatile npyv_s32 workaround = a; + a = __riscv_vor_vv_i32m1(workaround, a, 4); + #endif + return a; + } + return npyv_loadn2_s32(ptr, stride); +} + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } + +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } + +/********************************* + * Partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + switch(nlane) { + case 1: + vst1q_lane_s32((int32_t*)ptr, a, 0); + break; + case 2: + __riscv_vse32_v_i32m1((int32_t*)ptr, a, 2); + break; + case 3: + __riscv_vse32_v_i32m1((int32_t*)ptr, a, 2); + vst1q_lane_s32((int32_t*)ptr + 2, a, 2); + break; + default: + npyv_store_s32(ptr, a); + } +} + +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp 
nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + vst1q_lane_s64((int64_t*)ptr, a, 0); + return; + } + npyv_store_s64(ptr, a); +} + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + if (nlane == 1) { + // armhf strict to alignment, may cause bus error + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + return; + } + npyv_store_s32(ptr, a); +} + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); (void)nlane; + npyv_store_s64(ptr, a); +} + +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + vst1q_lane_s32((int32_t*)ptr, a, 0); + switch(nlane) { + case 1: + return; + case 2: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + return; + case 3: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + return; + default: + vst1q_lane_s32((int32_t*)ptr + stride, a, 1); + vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); + vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); + } +} +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + assert(nlane > 0); + if (nlane == 1) { + vst1q_lane_s64((int64_t*)ptr, a, 0); + return; + } + npyv_storen_s64(ptr, stride, a); +} + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + assert(nlane > 0); + vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); + if (nlane > 1) { + vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1); + } +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } + +/***************************************************************** + * Implement partial load/store for u32/f32/u64/f64... 
via casting + *****************************************************************/ +#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun; \ + pun.from_##F_SFX = fill; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun; \ + pun.from_##F_SFX = fill; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(f64, s64) + +// 128-bit/64-bit stride +#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, 
npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(f64, s64) + +/************************************************************ + * de-interleave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_RVV_MEM_INTERLEAVE(SFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + return npyv_unzip_##SFX( \ + npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ + ); \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \ + npyv_store_##SFX(ptr, zip.val[0]); \ + npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ + } + +NPYV_IMPL_RVV_MEM_INTERLEAVE(u8) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s8) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u16) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s16) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(f32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(f64) + +/********************************* + * Lookup table + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of uint32. 
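+// Hedged usage sketch (illustrative; `tbl` is a hypothetical 32-entry table and
+// each per-lane index must stay within [0, 31]):
+//
+//     npy_uint32 tbl[32] = { /* ... */ };
+//     npyv_u32 idx = npyv_set_u32(3, 0, 31, 7);   // per-lane indices into tbl
+//     npyv_u32 res = npyv_lut32_u32(tbl, idx);    // {tbl[3], tbl[0], tbl[31], tbl[7]}
+//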
+NPY_FINLINE npyv_u32 vcreate_u32(uint64_t a) { + return __riscv_vreinterpret_v_u64m1_u32m1( + __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vmv_v_x_u64m1(a, 8)))); +} + +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ + const unsigned i0 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 0, 4)); + const unsigned i1 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 1, 4)); + const unsigned i2 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 2, 4)); + const unsigned i3 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 3, 4)); + + npyv_u32 low = vcreate_u32(table[i0]); + low = vld1_lane_u32((const uint32_t*)table + i1, low, 1); + npyv_u32 high = vcreate_u32(table[i2]); + high = vld1_lane_u32((const uint32_t*)table + i3, high, 1); + return __riscv_vslideup_vx_u32m1(low, high, 2, 4); +} + +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } + +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ return npyv_reinterpret_f32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of uint64. +NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ + const unsigned i0 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(idx), 0, 4)); + const unsigned i1 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(idx), 2, 4)); + return __riscv_vslideup_vx_u64m1( + __riscv_vle64_v_u64m1((const uint64_t*)table + i0, 1), + __riscv_vle64_v_u64m1((const uint64_t*)table + i1, 1), 1, 2 + ); +} + +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } + +NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ return npyv_reinterpret_f64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } + +#endif // _NPY_SIMD_RVV_MEMORY_H diff --git a/numpy/_core/src/common/simd/rvv/misc.h b/numpy/_core/src/common/simd/rvv/misc.h new file mode 100644 index 000000000000..223471ee8f74 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/misc.h @@ -0,0 +1,753 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_MISC_H +#define _NPY_SIMD_RVV_MISC_H + +// vector with zero lanes +#define npyv_zero_u8() __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) +#define npyv_zero_s8() __riscv_vreinterpret_v_i32m1_i8m1(npyv_zero_s32()) +#define npyv_zero_u16() __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) +#define npyv_zero_s16() __riscv_vreinterpret_v_i32m1_i16m1(npyv_zero_s32()) +#define npyv_zero_u32() __riscv_vmv_v_x_u32m1((uint32_t)0, 4) +#define npyv_zero_s32() __riscv_vmv_v_x_i32m1((int32_t)0, 4) +#define npyv_zero_u64() __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) +#define npyv_zero_s64() __riscv_vreinterpret_v_i32m1_i64m1(npyv_zero_s32()) +#define npyv_zero_f32() __riscv_vfmv_v_f_f32m1(0.0f, 4) +#define npyv_zero_f64() __riscv_vfmv_v_f_f64m1(0.0, 2) + +// vector with a specific value set to all lanes +NPY_FINLINE npyv_u8 npyv_setall_u8(uint8_t val) +{ + size_t vlen = __riscv_vsetvlmax_e8m1(); + return __riscv_vmv_v_x_u8m1(val, vlen); +} + +NPY_FINLINE npyv_s8 
npyv_setall_s8(int8_t val) +{ + size_t vlen = __riscv_vsetvlmax_e8m1(); + return __riscv_vmv_v_x_i8m1(val, vlen); +} + +NPY_FINLINE npyv_u16 npyv_setall_u16(uint16_t val) +{ + size_t vlen = __riscv_vsetvlmax_e16m1(); + return __riscv_vmv_v_x_u16m1(val, vlen); +} + +NPY_FINLINE npyv_s16 npyv_setall_s16(int16_t val) +{ + size_t vlen = __riscv_vsetvlmax_e16m1(); + return __riscv_vmv_v_x_i16m1(val, vlen); +} + +NPY_FINLINE npyv_u32 npyv_setall_u32(uint32_t val) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vmv_v_x_u32m1(val, vlen); +} + +NPY_FINLINE npyv_s32 npyv_setall_s32(int32_t val) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vmv_v_x_i32m1(val, vlen); +} + +NPY_FINLINE npyv_u64 npyv_setall_u64(uint64_t val) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vmv_v_x_u64m1(val, vlen); +} + +NPY_FINLINE npyv_s64 npyv_setall_s64(int64_t val) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vmv_v_x_i64m1(val, vlen); +} + +NPY_FINLINE npyv_f32 npyv_setall_f32(float val) +{ + size_t vlen = __riscv_vsetvlmax_e32m1(); + return __riscv_vfmv_v_f_f32m1(val, vlen); +} + +NPY_FINLINE npyv_f64 npyv_setall_f64(double val) +{ + size_t vlen = __riscv_vsetvlmax_e64m1(); + return __riscv_vfmv_v_f_f64m1(val, vlen); +} + +// vector with specific values set to each lane and +// set a specific value to all remained lanes +NPY_FINLINE npyv_u8 npyv__set_u8(npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3, + npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9, + npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15) +{ + const uint8_t NPY_DECL_ALIGNED(16) data[16] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 + }; + return __riscv_vle8_v_u8m1(data, 16); +} + +NPY_FINLINE npyv_s8 npyv__set_s8(npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3, + npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9, + npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15) +{ + const int8_t NPY_DECL_ALIGNED(16) data[16] = { + i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 + }; + return __riscv_vle8_v_i8m1(data, 16); +} + +NPY_FINLINE vuint16m1_t npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3, + npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7) +{ + const uint16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return __riscv_vle16_v_u16m1(data, 8); +} + +NPY_FINLINE vint16m1_t npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3, + npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7) +{ + const int16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + return __riscv_vle16_v_i16m1(data, 8); +} + +NPY_FINLINE npyv_u32 npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3) +{ + const uint32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; + return __riscv_vle32_v_u32m1(data, 4); +} + +NPY_FINLINE vint32m1_t npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3) +{ + const int32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; + return __riscv_vle32_v_i32m1(data, 4); +} + +NPY_FINLINE npyv_u64 npyv__set_u64(npy_uint64 i0, npy_uint64 i1) +{ + const uint64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; + return __riscv_vle64_v_u64m1(data, 2); +} + +NPY_FINLINE vint64m1_t npyv__set_s64(npy_int64 i0, npy_int64 i1) +{ + const int64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; + return 
__riscv_vle64_v_i64m1(data, 2); +} + +NPY_FINLINE vfloat32m1_t npyv__set_f32(float i0, float i1, float i2, float i3) +{ + const float NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; + return __riscv_vle32_v_f32m1(data, 4); +} + +NPY_FINLINE vfloat64m1_t npyv__set_f64(double i0, double i1) +{ + const double NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; + return __riscv_vle64_v_f64m1(data, 2); +} + +#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(npy_uint8, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(npy_int8, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(npy_uint16, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(npy_int16, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(npy_uint32, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(npy_int32, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) 
npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +NPY_FINLINE npyv_u8 npyv_select_u8(npyv_u8 a, npyv_u8 b, npyv_u8 c) +{ + return __riscv_vxor_vv_u8m1(__riscv_vand_vv_u8m1(__riscv_vxor_vv_u8m1(c, b, 16), a, 16), c, 16); +} + +NPY_FINLINE npyv_s8 npyv_select_s8(npyv_u8 a, npyv_s8 b, npyv_s8 c) +{ + return __riscv_vxor_vv_i8m1( + __riscv_vand_vv_i8m1(__riscv_vxor_vv_i8m1(c, b, 16), __riscv_vreinterpret_v_u8m1_i8m1(a), 16), c, 16); +} + +NPY_FINLINE npyv_u16 npyv_select_u16(npyv_u16 a, npyv_u16 b, npyv_u16 c) +{ + return __riscv_vxor_vv_u16m1(__riscv_vand_vv_u16m1(__riscv_vxor_vv_u16m1(c, b, 8), a, 8), c, 8); +} + +NPY_FINLINE npyv_s16 npyv_select_s16(npyv_u16 a, npyv_s16 b, npyv_s16 c) +{ + return __riscv_vxor_vv_i16m1( + __riscv_vand_vv_i16m1(__riscv_vxor_vv_i16m1(c, b, 8), __riscv_vreinterpret_v_u16m1_i16m1(a), 8), c, 8); +} + +NPY_FINLINE npyv_u32 npyv_select_u32(npyv_u32 a, npyv_u32 b, npyv_u32 c) +{ + return __riscv_vxor_vv_u32m1(__riscv_vand_vv_u32m1(__riscv_vxor_vv_u32m1(c, b, 4), a, 4), c, 4); +} + +NPY_FINLINE npyv_s32 npyv_select_s32(npyv_u32 a, npyv_s32 b, npyv_s32 c) +{ + return __riscv_vxor_vv_i32m1( + __riscv_vand_vv_i32m1(__riscv_vxor_vv_i32m1(c, b, 4), __riscv_vreinterpret_v_u32m1_i32m1(a), 4), c, 4); +} + +NPY_FINLINE npyv_u64 npyv_select_u64(npyv_u64 a, npyv_u64 b, npyv_u64 c) +{ + return __riscv_vxor_vv_u64m1(__riscv_vand_vv_u64m1(__riscv_vxor_vv_u64m1(c, b, 2), a, 2), c, 2); +} + +NPY_FINLINE npyv_s64 npyv_select_s64(npyv_u64 a, npyv_s64 b, npyv_s64 c) +{ + return __riscv_vxor_vv_i64m1( + __riscv_vand_vv_i64m1(__riscv_vxor_vv_i64m1(c, b, 2), __riscv_vreinterpret_v_u64m1_i64m1(a), 2), c, 2); +} + +NPY_FINLINE npyv_f32 npyv_select_f32(npyv_u32 a, npyv_f32 b, npyv_f32 c) +{ + npyv_u32 b_u32 = __riscv_vreinterpret_v_f32m1_u32m1(b); + npyv_u32 c_u32 = __riscv_vreinterpret_v_f32m1_u32m1(c); + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vxor_vv_u32m1(__riscv_vand_vv_u32m1(__riscv_vxor_vv_u32m1(c_u32, b_u32, 4), a, 4), c_u32, 4)); +} + +NPY_FINLINE npyv_f64 npyv_select_f64(npyv_u64 a, npyv_f64 b, npyv_f64 c) +{ + npyv_u64 b_u64 = __riscv_vreinterpret_v_f64m1_u64m1(b); + npyv_u64 c_u64 = __riscv_vreinterpret_v_f64m1_u64m1(c); + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vxor_vv_u64m1(__riscv_vand_vv_u64m1(__riscv_vxor_vv_u64m1(c_u64, b_u64, 2), a, 2), c_u64, 2)); +} + +// extract the first vector's lane +NPY_FINLINE npy_uint8 npyv_extract0_u8(npyv_u8 a) +{ + return __riscv_vmv_x_s_u8m1_u8(__riscv_vslidedown_vx_u8m1(a, 0, 16)); +} + +NPY_FINLINE npy_int8 npyv_extract0_s8(npyv_s8 a) +{ + return __riscv_vmv_x_s_i8m1_i8(__riscv_vslidedown_vx_i8m1(a, 0, 16)); +} + +NPY_FINLINE npy_uint16 npyv_extract0_u16(npyv_u16 a) +{ + return __riscv_vmv_x_s_u16m1_u16(__riscv_vslidedown_vx_u16m1(a, 0, 8)); +} + +NPY_FINLINE npy_int16 npyv_extract0_s16(npyv_s16 a) +{ + return __riscv_vmv_x_s_i16m1_i16(__riscv_vslidedown_vx_i16m1(a, 0, 8)); +} + +NPY_FINLINE npy_uint32 npyv_extract0_u32(npyv_u32 a) +{ + return __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(a, 0, 4)); +} + +NPY_FINLINE npy_int32 npyv_extract0_s32(npyv_s32 a) +{ + return __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(a, 0, 4)); +} + +NPY_FINLINE npy_uint64 npyv_extract0_u64(npyv_u64 a) +{ + return __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(a, 0, 2)); +} + +NPY_FINLINE npy_int64 npyv_extract0_s64(npyv_s64 a) +{ + return __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(a, 0, 2)); +} + +NPY_FINLINE float npyv_extract0_f32(npyv_f32 a) +{ + return 
__riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(a, 0, 4)); +} + +NPY_FINLINE double npyv_extract0_f64(npyv_f64 a) +{ + return __riscv_vfmv_f_s_f64m1_f64(__riscv_vslidedown_vx_f64m1(a, 0, 2)); +} + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_i8m1_u8m1(a); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_u16m1_u8m1(a); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_u16m1_u8m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_u32m1_u8m1(a); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_u64m1_u8m1(a); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); +} + +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); +} + +#define npyv_reinterpret_s8_s8(X) X +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_u8m1_i8m1(a); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_i16m1_i8m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_i16m1_i8m1(a); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_i32m1_i8m1(a); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_i64m1_i8m1(a); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); +} + +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); +} + +#define npyv_reinterpret_u16_u16(X) X +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_u8m1_u16m1(a); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_u8m1_u16m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_i16m1_u16m1(a); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_u32m1_u16m1(a); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_u64m1_u16m1(a); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s64(npyv_s64 a) +{ + return 
__riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); +} + +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); +} + +#define npyv_reinterpret_s16_s16(X) X +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_i8m1_i16m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_i8m1_i16m1(a); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_u16m1_i16m1(a); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_i32m1_i16m1(a); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_i64m1_i16m1(a); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); +} + +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); +} + +#define npyv_reinterpret_u32_u32(X) X +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_u8m1_u32m1(a); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_u16m1_u32m1(a); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_i32m1_u32m1(a); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_u64m1_u32m1(a); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_f32m1_u32m1(a); +} + +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); +} + +#define npyv_reinterpret_s32_s32(X) X +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_i8m1_i32m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_i8m1_i32m1(a); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_i16m1_i32m1(a); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_u32m1_i32m1(a); +} + +NPY_FINLINE npyv_s32 
npyv_reinterpret_s32_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_i64m1_i32m1(a); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_f32m1_i32m1(a); +} + +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); +} + +#define npyv_reinterpret_u64_u64(X) X +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_u8m1_u64m1(a); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_u8m1_u64m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_u16m1_u64m1(a); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_u32m1_u64m1(a); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_i64m1_u64m1(a); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); +} + +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_f64m1_u64m1(a); +} + +#define npyv_reinterpret_s64_s64(X) X +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_i8m1_i64m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_i8m1_i64m1(a); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_i16m1_i64m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_i16m1_i64m1(a); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_i32m1_i64m1(a); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_u64m1_i64m1(a); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); +} + +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_f64m1_i64m1(a); +} + +#define npyv_reinterpret_f32_f32(X) X +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(a)); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(a)); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(a)); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s16(npyv_s16 a) +{ + return 
__riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i16m1_i32m1(a)); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1(a); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_i32m1_f32m1(a); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(a)); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1(a)); +} + +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a))); +} + +#define npyv_reinterpret_f64_f64(X) X +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u8(npyv_u8 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u8m1_u64m1(a)); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s8(npyv_s8 a) +{ + return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i8m1_i64m1(a)); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u16(npyv_u16 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(a)); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s16(npyv_s16 a) +{ + return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i16m1_i64m1(a)); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u32(npyv_u32 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(a)); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s32(npyv_s32 a) +{ + return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(a)); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u64(npyv_u64 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1(a); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_i64m1_f64m1(a); +} + +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a))); +} + +// Only required by AVX2/AVX512 +#define npyv_cleanup() ((void)0) + +#endif // _NPY_SIMD_RVV_MISC_H diff --git a/numpy/_core/src/common/simd/rvv/operators.h b/numpy/_core/src/common/simd/rvv/operators.h new file mode 100644 index 000000000000..a41e605093e0 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/operators.h @@ -0,0 +1,944 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_OPERATORS_H +#define _NPY_SIMD_RVV_OPERATORS_H + +/*************************** + * Shifting + ***************************/ +// left +NPY_FINLINE npyv_u16 npyv_shl_u16(npyv_u16 a, int16_t c) +{ + npyv_s16 b = npyv_setall_s16(c); + vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); + npyv_u16 shl = __riscv_vsll_vv_u16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); + vuint32m2_t a_ext = __riscv_vzext_vf2_u32m2(a, 8); + npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); + npyv_u16 shr = __riscv_vnclipu_wv_u16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); + return __riscv_vmerge_vvm_u16m1(shr, shl, positive_mask, 8); +} + +NPY_FINLINE npyv_s16 npyv_shl_s16(npyv_s16 a, int16_t c) +{ + npyv_s16 b = npyv_setall_s16(c); + vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); + npyv_s16 shl = __riscv_vsll_vv_i16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); + vint32m2_t a_ext = __riscv_vsext_vf2_i32m2(a, 8); + npyv_s16 
b_neg = __riscv_vneg_v_i16m1(b, 8); + npyv_s16 shr = __riscv_vnclip_wv_i16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); + return __riscv_vmerge_vvm_i16m1(shr, shl, positive_mask, 8); +} + +NPY_FINLINE npyv_u32 npyv_shl_u32(npyv_u32 a, int32_t c) +{ + npyv_s32 b = npyv_setall_s32(c); + vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); + npyv_u32 shl = __riscv_vsll_vv_u32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); + vuint64m2_t a_ext = __riscv_vzext_vf2_u64m2(a, 4); + npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); + npyv_u32 shr = __riscv_vnclipu_wv_u32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); + return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 4); +} + +NPY_FINLINE npyv_s32 npyv_shl_s32(npyv_s32 a, int32_t c) +{ + npyv_s32 b = npyv_setall_s32(c); + vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); + npyv_s32 shl = __riscv_vsll_vv_i32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); + vint64m2_t a_ext = __riscv_vsext_vf2_i64m2(a, 4); + npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); + npyv_s32 shr = __riscv_vnclip_wv_i32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); + return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 4); +} + +NPY_FINLINE npyv_u64 npyv_shl_u64(npyv_u64 a, int64_t c) +{ + npyv_s64 b = npyv_setall_s64(c); + vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); + npyv_u64 shl = __riscv_vsll_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); + npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); + npyv_u64 shr = __riscv_vsrl_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); + return __riscv_vmerge_vvm_u64m1(shr, shl, positive_mask, 2); +} + +NPY_FINLINE npyv_s64 npyv_shl_s64(npyv_s64 a, int64_t c) +{ + npyv_s64 b = npyv_setall_s64(c); + vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); + npyv_s64 shl = __riscv_vsll_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); + npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); + npyv_s64 shr = __riscv_vsra_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); + return __riscv_vmerge_vvm_i64m1(shr, shl, positive_mask, 2); +} + +// left by an immediate constant +NPY_FINLINE npyv_u16 npyv_shli_u16(npyv_u16 a, const int b) +{ + return __riscv_vsll_vx_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_s16 npyv_shli_s16(npyv_s16 a, const int b) +{ + return __riscv_vsll_vx_i16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_shli_u32(npyv_u32 a, const int b) +{ + return __riscv_vsll_vx_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_s32 npyv_shli_s32(npyv_s32 a, const int b) +{ + return __riscv_vsll_vx_i32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_shli_u64(npyv_u64 a, const int b) +{ + return __riscv_vsll_vx_u64m1(a, b, 2); +} + +NPY_FINLINE npyv_s64 npyv_shli_s64(npyv_s64 a, const int b) +{ + return __riscv_vsll_vx_i64m1(a, b, 2); +} + +// right +NPY_FINLINE npyv_u16 npyv_shr_u16(npyv_u16 a, int16_t c) +{ + npyv_s16 b = npyv_setall_s16(-(c)); + vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); + npyv_u16 shl = __riscv_vsll_vv_u16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); + vuint32m2_t a_ext = __riscv_vzext_vf2_u32m2(a, 8); + npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); + npyv_u16 shr = __riscv_vnclipu_wv_u16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); + return __riscv_vmerge_vvm_u16m1(shr, shl, positive_mask, 8); +} + +NPY_FINLINE npyv_s16 npyv_shr_s16(npyv_s16 a, int16_t c) +{ + npyv_s16 b = npyv_setall_s16(-(c)); + vbool16_t positive_mask = 
__riscv_vmsgt_vx_i16m1_b16(b, 0, 8); + npyv_s16 shl = __riscv_vsll_vv_i16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); + vint32m2_t a_ext = __riscv_vsext_vf2_i32m2(a, 8); + npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); + npyv_s16 shr = __riscv_vnclip_wv_i16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); + return __riscv_vmerge_vvm_i16m1(shr, shl, positive_mask, 8); +} + +NPY_FINLINE npyv_u32 npyv_shr_u32(npyv_u32 a, int32_t c) +{ + npyv_s32 b = npyv_setall_s32(-(c)); + vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); + npyv_u32 shl = __riscv_vsll_vv_u32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); + vuint64m2_t a_ext = __riscv_vzext_vf2_u64m2(a, 4); + npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); + npyv_u32 shr = __riscv_vnclipu_wv_u32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); + return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 4); +} + +NPY_FINLINE npyv_s32 npyv_shr_s32(npyv_s32 a, int32_t c) +{ + npyv_s32 b = npyv_setall_s32(-(c)); + vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); + npyv_s32 shl = __riscv_vsll_vv_i32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); + vint64m2_t a_ext = __riscv_vsext_vf2_i64m2(a, 4); + npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); + npyv_s32 shr = __riscv_vnclip_wv_i32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); + return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 4); +} + +NPY_FINLINE npyv_u64 npyv_shr_u64(npyv_u64 a, int64_t c) +{ + npyv_s64 b = npyv_setall_s64(-(c)); + // implementation only works within defined range 'b' in [0, 63] + vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); + npyv_u64 shl = __riscv_vsll_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); + npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); + npyv_u64 shr = __riscv_vsrl_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); + return __riscv_vmerge_vvm_u64m1(shr, shl, positive_mask, 2); +} + +NPY_FINLINE npyv_s64 npyv_shr_s64(npyv_s64 a, int64_t c) +{ + npyv_s64 b = npyv_setall_s64(-(c)); + // implementation only works within defined range 'b' in [0, 63] + vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); + npyv_s64 shl = __riscv_vsll_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); + npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); + npyv_s64 shr = __riscv_vsra_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); + return __riscv_vmerge_vvm_i64m1(shr, shl, positive_mask, 2); +} + +// right by an immediate constant +NPY_FINLINE npyv_u16 npyv_shri_u16(npyv_u16 a, const int b) { + const int b_half = b >> 1; + npyv_u16 srl1 = __riscv_vsrl_vx_u16m1(a, b_half, 8); + return __riscv_vsrl_vx_u16m1(srl1, b_half + (b & 0x1), 8); +} + +NPY_FINLINE npyv_s16 npyv_shri_s16(npyv_s16 a, const int b) { + const int imm = b - (b >> 4); + return __riscv_vsra_vx_i16m1(a, imm, 8); +} + +NPY_FINLINE npyv_u32 npyv_shri_u32(npyv_u32 a, const int b) { + const int b_half = b >> 1; + npyv_u32 srl1 = __riscv_vsrl_vx_u32m1(a, b_half, 4); + return __riscv_vsrl_vx_u32m1(srl1, b_half + (b & 0x1), 4); +} + +NPY_FINLINE npyv_s32 npyv_shri_s32(npyv_s32 a, const int b) { + const int imm = b - (b >> 5); + return __riscv_vsra_vx_i32m1(a, imm, 4); +} + +NPY_FINLINE npyv_u64 npyv_shri_u64(npyv_u64 a, const int b) { + const int b_half = b >> 1; + npyv_u64 srl1 = __riscv_vsrl_vx_u64m1(a, b_half, 2); + return __riscv_vsrl_vx_u64m1(srl1, b_half + (b & 0x1), 2); +} + +NPY_FINLINE npyv_s64 npyv_shri_s64(npyv_s64 a, const int b) { + const int imm = b - (b >> 
6); + return __riscv_vsra_vx_i64m1(a, imm, 2); +} + +/*************************** + * Logical + ***************************/ +// AND +NPY_FINLINE npyv_u8 npyv_and_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vand_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_and_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vand_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_and_u16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vand_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_s16 npyv_and_s16(npyv_s16 a, npyv_s16 b) +{ + return __riscv_vand_vv_i16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_and_u32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vand_vv_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_s32 npyv_and_s32(npyv_s32 a, npyv_s32 b) +{ + return __riscv_vand_vv_i32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_and_u64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vand_vv_u64m1(a, b, 2); +} + +NPY_FINLINE npyv_s64 npyv_and_s64(npyv_s64 a, npyv_s64 b) +{ + return __riscv_vand_vv_i64m1(a, b, 2); +} + +NPY_FINLINE npyv_f32 npyv_and_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vreinterpret_v_f32m1_u32m1(b), + 4 + ) + ); +} + +NPY_FINLINE npyv_f64 npyv_and_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vreinterpret_v_f64m1_u64m1(b), + 2 + ) + ); +} + +NPY_FINLINE npyv_u8 npyv_and_b8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vand_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_and_b16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vand_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_and_b32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vand_vv_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_and_b64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vand_vv_u64m1(a, b, 2); +} + +// OR +NPY_FINLINE npyv_u8 npyv_or_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vor_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_or_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vor_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_or_u16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vor_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_s16 npyv_or_s16(npyv_s16 a, npyv_s16 b) +{ + return __riscv_vor_vv_i16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_or_u32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vor_vv_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_s32 npyv_or_s32(npyv_s32 a, npyv_s32 b) +{ + return __riscv_vor_vv_i32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_or_u64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vor_vv_u64m1(a, b, 2); +} + +NPY_FINLINE npyv_s64 npyv_or_s64(npyv_s64 a, npyv_s64 b) +{ + return __riscv_vor_vv_i64m1(a, b, 2); +} + +NPY_FINLINE npyv_f32 npyv_or_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vreinterpret_v_f32m1_u32m1(b), + 4 + ) + ); +} + +NPY_FINLINE npyv_f64 npyv_or_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vreinterpret_v_f64m1_u64m1(b), + 2 + ) + ); +} + +NPY_FINLINE npyv_u8 npyv_or_b8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vor_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_or_b16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vor_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_or_b32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vor_vv_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_or_b64(npyv_u64 a, 
npyv_u64 b) +{ + return __riscv_vor_vv_u64m1(a, b, 2); +} + +// XOR +NPY_FINLINE npyv_u8 npyv_xor_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vxor_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_s8 npyv_xor_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vxor_vv_i8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_xor_u16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vxor_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_s16 npyv_xor_s16(npyv_s16 a, npyv_s16 b) +{ + return __riscv_vxor_vv_i16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_xor_u32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vxor_vv_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_s32 npyv_xor_s32(npyv_s32 a, npyv_s32 b) +{ + return __riscv_vxor_vv_i32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_xor_u64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vxor_vv_u64m1(a, b, 2); +} + +NPY_FINLINE npyv_s64 npyv_xor_s64(npyv_s64 a, npyv_s64 b) +{ + return __riscv_vxor_vv_i64m1(a, b, 2); +} + +NPY_FINLINE npyv_f32 npyv_xor_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vxor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vreinterpret_v_f32m1_u32m1(b), + 4 + ) + ); +} + +NPY_FINLINE npyv_f64 npyv_xor_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vxor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vreinterpret_v_f64m1_u64m1(b), + 2 + ) + ); +} + +NPY_FINLINE npyv_u8 npyv_xor_b8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vxor_vv_u8m1(a, b, 16); +} + +NPY_FINLINE npyv_u16 npyv_xor_b16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vxor_vv_u16m1(a, b, 8); +} + +NPY_FINLINE npyv_u32 npyv_xor_b32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vxor_vv_u32m1(a, b, 4); +} + +NPY_FINLINE npyv_u64 npyv_xor_b64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vxor_vv_u64m1(a, b, 2); +} + +// NOT +NPY_FINLINE npyv_u8 npyv_not_u8(npyv_u8 a) +{ + return __riscv_vnot_v_u8m1(a, 16); +} + +NPY_FINLINE npyv_s8 npyv_not_s8(npyv_s8 a) +{ + return __riscv_vnot_v_i8m1(a, 16); +} + +NPY_FINLINE npyv_u16 npyv_not_u16(npyv_u16 a) +{ + return __riscv_vnot_v_u16m1(a, 8); +} + +NPY_FINLINE npyv_s16 npyv_not_s16(npyv_s16 a) +{ + return __riscv_vnot_v_i16m1(a, 8); +} + +NPY_FINLINE npyv_u32 npyv_not_u32(npyv_u32 a) +{ + return __riscv_vnot_v_u32m1(a, 4); +} + +NPY_FINLINE npyv_s32 npyv_not_s32(npyv_s32 a) +{ + return __riscv_vnot_v_i32m1(a, 4); +} + +NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a) +{ + return __riscv_vnot_v_u64m1(a, 2); +} + +NPY_FINLINE npyv_s64 npyv_not_s64(npyv_s64 a) +{ + return __riscv_vreinterpret_v_u64m1_i64m1( + __riscv_vnot_v_u64m1( + __riscv_vreinterpret_v_i64m1_u64m1(a), + 2 + ) + ); +} + +NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vnot_v_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + 4 + ) + ); +} + +NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vnot_v_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + 2 + ) + ); +} + +NPY_FINLINE npyv_u8 npyv_not_b8(npyv_u8 a) +{ + return __riscv_vnot_v_u8m1(a, 16); +} + +NPY_FINLINE npyv_u16 npyv_not_b16(npyv_u16 a) +{ + return __riscv_vnot_v_u16m1(a, 8); +} + +NPY_FINLINE npyv_u32 npyv_not_b32(npyv_u32 a) +{ + return __riscv_vnot_v_u32m1(a, 4); +} + +#define npyv_not_b64 npyv_not_u64 + +// ANDC, ORC and XNOR +NPY_FINLINE npyv_u8 npyv_andc_u8(npyv_u8 a, npyv_u8 b) { + return __riscv_vand_vv_u8m1(a, __riscv_vnot_v_u8m1(b, 16), 16); +} + +NPY_FINLINE npyv_u8 npyv_andc_b8(npyv_u8 a, npyv_u8 b) { + return __riscv_vand_vv_u8m1(a, 
__riscv_vnot_v_u8m1(b, 16), 16); +} + +NPY_FINLINE npyv_u8 npyv_orc_b8(npyv_u8 a, npyv_u8 b) { + return __riscv_vor_vv_u8m1(a, __riscv_vnot_v_u8m1(b, 16), 16); +} + +NPY_FINLINE npyv_u8 npyv_xnor_b8(npyv_u8 a, npyv_u8 b) { + vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +/*************************** + * Comparison + ***************************/ +// equal +NPY_FINLINE npyv_u8 npyv_cmpeq_u8(npyv_u8 a, npyv_u8 b) { + vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +NPY_FINLINE npyv_u16 npyv_cmpeq_u16(npyv_u16 a, npyv_u16 b) { + vbool16_t cmp_res = __riscv_vmseq_vv_u16m1_b16(a, b, 8); + return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); +} + +NPY_FINLINE npyv_u32 npyv_cmpeq_u32(npyv_u32 a, npyv_u32 b) { + vbool32_t cmp_res = __riscv_vmseq_vv_u32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u8 npyv_cmpeq_s8(npyv_s8 a, npyv_s8 b) { + vbool8_t cmp_res = __riscv_vmseq_vv_i8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +NPY_FINLINE npyv_u16 npyv_cmpeq_s16(npyv_s16 a, npyv_s16 b) { + vbool16_t cmp_res = __riscv_vmseq_vv_i16m1_b16(a, b, 8); + return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); +} + +NPY_FINLINE npyv_u32 npyv_cmpeq_s32(npyv_s32 a, npyv_s32 b) { + vbool32_t cmp_res = __riscv_vmseq_vv_i32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u32 npyv_cmpeq_f32(vfloat32m1_t a, vfloat32m1_t b) { + vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u64 npyv_cmpeq_f64(vfloat64m1_t a, vfloat64m1_t b) { + vbool64_t cmp_res = __riscv_vmfeq_vv_f64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +NPY_FINLINE npyv_u64 npyv_cmpeq_u64(npyv_u64 a, npyv_u64 b) { + vbool64_t cmp_res = __riscv_vmseq_vv_u64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +NPY_FINLINE npyv_u64 npyv_cmpeq_s64(npyv_s64 a, npyv_s64 b) { + vbool64_t cmp_res = __riscv_vmseq_vv_i64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +// not Equal +NPY_FINLINE npyv_u8 npyv_cmpneq_u8(npyv_u8 a, npyv_u8 b) { + vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8(a, b, 16); + return __riscv_vnot_v_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16), 16); +} + +NPY_FINLINE npyv_u8 npyv_cmpneq_s8(npyv_s8 a, npyv_s8 b) { + vbool8_t cmp_res = __riscv_vmseq_vv_i8m1_b8(a, b, 16); + return __riscv_vnot_v_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16), 16); +} + +NPY_FINLINE npyv_u16 npyv_cmpneq_u16(npyv_u16 a, npyv_u16 b) { + vbool16_t cmp_res = 
__riscv_vmseq_vv_u16m1_b16(a, b, 8); + return __riscv_vnot_v_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8), 8); +} + +NPY_FINLINE npyv_u16 npyv_cmpneq_s16(npyv_s16 a, npyv_s16 b) { + vbool16_t cmp_res = __riscv_vmseq_vv_i16m1_b16(a, b, 8); + return __riscv_vnot_v_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8), 8); +} + +NPY_FINLINE npyv_u32 npyv_cmpneq_u32(npyv_u32 a, npyv_u32 b) { + vbool32_t cmp_res = __riscv_vmseq_vv_u32m1_b32(a, b, 4); + return __riscv_vnot_v_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4), 4); +} + +NPY_FINLINE npyv_u32 npyv_cmpneq_s32(npyv_s32 a, npyv_s32 b) { + vbool32_t cmp_res = __riscv_vmseq_vv_i32m1_b32(a, b, 4); + return __riscv_vnot_v_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4), 4); +} + +#define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B)) +#define npyv_cmpneq_s64(A, B) npyv_not_u64(npyv_cmpeq_s64(A, B)) + +NPY_FINLINE npyv_u32 npyv_cmpneq_f32(vfloat32m1_t a, vfloat32m1_t b) { + vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(a, b, 4); + return __riscv_vnot_v_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4), 4); +} + +#define npyv_cmpneq_f64(A, B) npyv_not_u64(__riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), __riscv_vmfeq_vv_f64m1_b64(A, B, 2), 2)) + +// greater than +NPY_FINLINE npyv_u8 npyv_cmpgt_u8(npyv_u8 a, npyv_u8 b) { + vbool8_t cmp_res = __riscv_vmsgtu_vv_u8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +NPY_FINLINE npyv_u8 npyv_cmpgt_s8(npyv_s8 a, npyv_s8 b) { + vbool8_t cmp_res = __riscv_vmsgt_vv_i8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +NPY_FINLINE npyv_u16 npyv_cmpgt_u16(npyv_u16 a, npyv_u16 b) { + vbool16_t cmp_res = __riscv_vmsgtu_vv_u16m1_b16(a, b, 8); + return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); +} + +NPY_FINLINE npyv_u16 npyv_cmpgt_s16(npyv_s16 a, npyv_s16 b) { + vbool16_t cmp_res = __riscv_vmsgt_vv_i16m1_b16(a, b, 8); + return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); +} + +NPY_FINLINE npyv_u32 npyv_cmpgt_u32(npyv_u32 a, npyv_u32 b) { + vbool32_t cmp_res = __riscv_vmsgtu_vv_u32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u32 npyv_cmpgt_s32(npyv_s32 a, npyv_s32 b) { + vbool32_t cmp_res = __riscv_vmsgt_vv_i32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u32 npyv_cmpgt_f32(vfloat32m1_t a, vfloat32m1_t b) { + vbool32_t cmp_res = __riscv_vmfgt_vv_f32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u64 npyv_cmpgt_f64(vfloat64m1_t a, vfloat64m1_t b) { + vbool64_t cmp_res = __riscv_vmfgt_vv_f64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + 
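For reference, every npyv_cmp* routine in this header follows the same idiom: the RVV compare intrinsics yield a mask register (vboolN_t), while this backend represents NPYV boolean vectors as plain element vectors whose lanes are all-ones or all-zeros, so each comparison widens the mask with a vmerge of a zero vector and an all-ones vector. A minimal sketch of that widening step, shown with a hypothetical helper name (the header inlines the pattern rather than defining such a helper):

#include <stdint.h>
#include <riscv_vector.h>

// Hypothetical helper illustrating the mask-widening idiom used by the
// npyv_cmp* routines: select all-ones where the mask is set, zero elsewhere.
static inline vuint32m1_t rvv_mask_to_b32(vbool32_t mask, size_t vl)
{
    return __riscv_vmerge_vvm_u32m1(
        __riscv_vmv_v_x_u32m1(0, vl),           // taken where mask is false
        __riscv_vmv_v_x_u32m1(UINT32_MAX, vl),  // taken where mask is true
        mask, vl);
}

// e.g. npyv_cmpgt_s32(a, b) is equivalent to
//      rvv_mask_to_b32(__riscv_vmsgt_vv_i32m1_b32(a, b, 4), 4)
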
+NPY_FINLINE npyv_u64 npyv_cmpgt_u64(npyv_u64 a, npyv_u64 b) { + vbool64_t cmp_res = __riscv_vmsgtu_vv_u64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +NPY_FINLINE npyv_u64 npyv_cmpgt_s64(npyv_s64 a, npyv_s64 b) { + vbool64_t cmp_res = __riscv_vmsgt_vv_i64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +// greater than or equal +NPY_FINLINE npyv_u8 npyv_cmpge_u8(npyv_u8 a, npyv_u8 b) { + vbool8_t cmp_res = __riscv_vmsgeu_vv_u8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +NPY_FINLINE npyv_u8 npyv_cmpge_s8(npyv_s8 a, npyv_s8 b) { + vbool8_t cmp_res = __riscv_vmsge_vv_i8m1_b8(a, b, 16); + return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); +} + +NPY_FINLINE npyv_u16 npyv_cmpge_u16(npyv_u16 a, npyv_u16 b) { + vbool16_t cmp_res = __riscv_vmsgeu_vv_u16m1_b16(a, b, 8); + return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); +} + +NPY_FINLINE npyv_u16 npyv_cmpge_s16(npyv_s16 a, npyv_s16 b) { + vbool16_t cmp_res = __riscv_vmsge_vv_i16m1_b16(a, b, 8); + return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); +} + +NPY_FINLINE npyv_u32 npyv_cmpge_u32(npyv_u32 a, npyv_u32 b) { + vbool32_t cmp_res = __riscv_vmsgeu_vv_u32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u32 npyv_cmpge_s32(npyv_s32 a, npyv_s32 b) { + vbool32_t cmp_res = __riscv_vmsge_vv_i32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u32 npyv_cmpge_f32(vfloat32m1_t a, vfloat32m1_t b) { + vbool32_t cmp_res = __riscv_vmfge_vv_f32m1_b32(a, b, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_u64 npyv_cmpge_f64(vfloat64m1_t a, vfloat64m1_t b) { + vbool64_t cmp_res = __riscv_vmfge_vv_f64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +NPY_FINLINE npyv_u64 npyv_cmpge_u64(npyv_u64 a, npyv_u64 b) { + vbool64_t cmp_res = __riscv_vmsgeu_vv_u64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +NPY_FINLINE npyv_u64 npyv_cmpge_s64(npyv_s64 a, npyv_s64 b) { + vbool64_t cmp_res = __riscv_vmsge_vv_i64m1_b64(a, b, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +// less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) +#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A) +#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A) + +// less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define 
npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) +#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) +#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) + +// check special cases +NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) +{ + vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(a, a, 4); + return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); +} + +NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) +{ + vbool64_t cmp_res = __riscv_vmfeq_vv_f64m1_b64(a, a, 2); + return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); +} + +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +NPY_FINLINE bool npyv_any_b8(npyv_u8 a) +{ + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(0, 16), 16)) != 0; +} +NPY_FINLINE bool npyv_all_b8(npyv_u8 a) +{ + return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), 16)) != 0; +} + +NPY_FINLINE bool npyv_any_b16(npyv_b16 a) +{ + return __riscv_vmv_x_s_u16m1_u16(__riscv_vredmaxu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(0, 8), 8)) != 0; +} + +NPY_FINLINE bool npyv_all_b16(npyv_b16 a) +{ + return __riscv_vmv_x_s_u16m1_u16(__riscv_vredminu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), 8)) != 0; +} + +NPY_FINLINE bool npyv_any_b32(npyv_b32 a) +{ + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 4)) != 0; +} + +NPY_FINLINE bool npyv_all_b32(npyv_b32 a) +{ + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)) != 0; +} + +NPY_FINLINE bool npyv_any_u8(npyv_u8 a) +{ + return npyv_any_b8(npyv_reinterpret_u8_u8(a)); +} + +NPY_FINLINE bool npyv_all_u8(npyv_u8 a) +{ + return npyv_all_b8(npyv_reinterpret_u8_u8(a)); +} + +NPY_FINLINE bool npyv_any_s8(npyv_s8 a) +{ + return npyv_any_b8(npyv_reinterpret_u8_s8(a)); +} + +NPY_FINLINE bool npyv_all_s8(npyv_s8 a) +{ + return npyv_all_b8(npyv_reinterpret_u8_s8(a)); +} + +NPY_FINLINE bool npyv_any_u16(npyv_u16 a) +{ + return npyv_any_b16(npyv_reinterpret_u16_u16(a)); +} + +NPY_FINLINE bool npyv_all_u16(npyv_u16 a) +{ + return npyv_all_b16(npyv_reinterpret_u16_u16(a)); +} + +NPY_FINLINE bool npyv_any_s16(npyv_s16 a) +{ + return npyv_any_b16(npyv_reinterpret_u16_s16(a)); +} + +NPY_FINLINE bool npyv_all_s16(npyv_s16 a) +{ + return npyv_all_b16(npyv_reinterpret_u16_s16(a)); +} + +NPY_FINLINE bool npyv_any_u32(npyv_u32 a) +{ + return npyv_any_b32(npyv_reinterpret_u32_u32(a)); +} + +NPY_FINLINE bool npyv_all_u32(npyv_u32 a) +{ + return npyv_all_b32(npyv_reinterpret_u32_u32(a)); +} + +NPY_FINLINE bool npyv_any_s32(npyv_s32 a) +{ + return npyv_any_b32(npyv_reinterpret_u32_s32(a)); +} + +NPY_FINLINE bool npyv_all_s32(npyv_s32 a) +{ + return npyv_all_b32(npyv_reinterpret_u32_s32(a)); +} + +NPY_FINLINE bool npyv_any_b64(npyv_b64 a) +{ + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(a), __riscv_vmv_v_x_u32m1(0, 4), 4)) != 0; +} + +NPY_FINLINE bool npyv_all_b64(npyv_b64 a) +{ + return 
__riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(a), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)) != 0; +} + +#define npyv_any_u64 npyv_any_b64 + +NPY_FINLINE npyv_u32 vrev64q_u32(npyv_u32 a) { + npyv_u32 vid = __riscv_vid_v_u32m1(2); + npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); + npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); + npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); + return __riscv_vrgather_vv_u32m1(a, idxs, 4); +} + +NPY_FINLINE bool npyv_all_u64(npyv_u64 a) +{ + npyv_u32 a32 = __riscv_vreinterpret_v_u64m1_u32m1(a); + a32 = __riscv_vor_vv_u32m1(a32, vrev64q_u32(a32), 4); + return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a32, __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)) != 0; +} + +NPY_FINLINE bool npyv_any_s64(npyv_s64 a) +{ + return npyv_any_u64(__riscv_vreinterpret_v_i64m1_u64m1(a)); +} + +NPY_FINLINE bool npyv_all_s64(npyv_s64 a) +{ + return npyv_all_u64(__riscv_vreinterpret_v_i64m1_u64m1(a)); +} + +NPY_FINLINE bool npyv_any_f32(npyv_f32 a) +{ + return !npyv_all_b32(npyv_cmpeq_f32(a, npyv_zero_f32())); +} + +NPY_FINLINE bool npyv_all_f32(npyv_f32 a) +{ + return !npyv_any_b32(npyv_cmpeq_f32(a, npyv_zero_f32())); +} + +NPY_FINLINE bool npyv_any_f64(npyv_f64 a) +{ + return !npyv_all_b64(npyv_cmpeq_f64(a, npyv_zero_f64())); +} + +NPY_FINLINE bool npyv_all_f64(npyv_f64 a) +{ + return !npyv_any_b64(npyv_cmpeq_f64(a, npyv_zero_f64())); +} + +#endif // _NPY_SIMD_RVV_OPERATORS_H diff --git a/numpy/_core/src/common/simd/rvv/reorder.h b/numpy/_core/src/common/simd/rvv/reorder.h new file mode 100644 index 000000000000..25ebf62851a5 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/reorder.h @@ -0,0 +1,970 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_REORDER_H +#define _NPY_SIMD_RVV_REORDER_H + +// combine lower part of two vectors +NPY_FINLINE npyv_u8 npyv_combinel_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vslideup_vx_u8m1(a, b, 8, 16); +} + +NPY_FINLINE npyv_s8 npyv_combinel_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vslideup_vx_i8m1(a, b, 8, 16); +} + +NPY_FINLINE npyv_u16 npyv_combinel_u16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vslideup_vx_u16m1(a, b, 4, 8); +} + +NPY_FINLINE npyv_s16 npyv_combinel_s16(npyv_s16 a, npyv_s16 b) +{ + return __riscv_vslideup_vx_i16m1(a, b, 4, 8); +} + +NPY_FINLINE npyv_u32 npyv_combinel_u32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vslideup_vx_u32m1(a, b, 2, 4); +} + +NPY_FINLINE npyv_s32 npyv_combinel_s32(npyv_s32 a, npyv_s32 b) +{ + return __riscv_vslideup_vx_i32m1(a, b, 2, 4); +} + +NPY_FINLINE npyv_u64 npyv_combinel_u64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vslideup_vx_u64m1(a, b, 1, 2); +} + +NPY_FINLINE npyv_s64 npyv_combinel_s64(npyv_s64 a, npyv_s64 b) +{ + return __riscv_vslideup_vx_i64m1(a, b, 1, 2); +} + +NPY_FINLINE npyv_f32 npyv_combinel_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vslideup_vx_f32m1(a, b, 2, 4); +} + +NPY_FINLINE npyv_f64 npyv_combinel_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vslideup_vx_f64m1(a, b, 1, 2); +} + +// combine higher part of two vectors +NPY_FINLINE npyv_u8 npyv_combineh_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vslideup_vx_u8m1( + __riscv_vslidedown_vx_u8m1(a, 8, 16), + __riscv_vslidedown_vx_u8m1(b, 8, 16), + 8, + 16 + ); +} + +NPY_FINLINE npyv_u16 npyv_combineh_u16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vslideup_vx_u16m1( + __riscv_vslidedown_vx_u16m1(a, 4, 8), + 
__riscv_vslidedown_vx_u16m1(b, 4, 8), + 4, + 8 + ); +} + +NPY_FINLINE npyv_u32 npyv_combineh_u32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vslideup_vx_u32m1( + __riscv_vslidedown_vx_u32m1(a, 2, 4), + __riscv_vslidedown_vx_u32m1(b, 2, 4), + 2, + 4 + ); +} + +NPY_FINLINE npyv_u64 npyv_combineh_u64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vslideup_vx_u64m1( + __riscv_vslidedown_vx_u64m1(a, 1, 2), + __riscv_vslidedown_vx_u64m1(b, 1, 2), + 1, + 2 + ); +} + +NPY_FINLINE npyv_s8 npyv_combineh_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vslideup_vx_i8m1( + __riscv_vslidedown_vx_i8m1(a, 8, 16), + __riscv_vslidedown_vx_i8m1(b, 8, 16), + 8, + 16 + ); +} + +NPY_FINLINE npyv_s16 npyv_combineh_s16(npyv_s16 a, npyv_s16 b) +{ + return __riscv_vslideup_vx_i16m1( + __riscv_vslidedown_vx_i16m1(a, 4, 8), + __riscv_vslidedown_vx_i16m1(b, 4, 8), + 4, + 8 + ); +} + +NPY_FINLINE npyv_s32 npyv_combineh_s32(npyv_s32 a, npyv_s32 b) +{ + return __riscv_vslideup_vx_i32m1( + __riscv_vslidedown_vx_i32m1(a, 2, 4), + __riscv_vslidedown_vx_i32m1(b, 2, 4), + 2, + 4 + ); +} + +NPY_FINLINE npyv_s64 npyv_combineh_s64(npyv_s64 a, npyv_s64 b) +{ + return __riscv_vslideup_vx_i64m1( + __riscv_vslidedown_vx_i64m1(a, 1, 2), + __riscv_vslidedown_vx_i64m1(b, 1, 2), + 1, + 2 + ); +} + +NPY_FINLINE npyv_f32 npyv_combineh_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vslideup_vx_f32m1( + __riscv_vslidedown_vx_f32m1(a, 2, 4), + __riscv_vslidedown_vx_f32m1(b, 2, 4), + 2, + 4 + ); +} + +NPY_FINLINE npyv_f64 npyv_combineh_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vslideup_vx_f64m1( + __riscv_vslidedown_vx_f64m1(a, 1, 2), + __riscv_vslidedown_vx_f64m1(b, 1, 2), + 1, + 2 + ); +} + +// combine two vectors from lower and higher parts of two other vectors +#define NPYV_IMPL_RVV_COMBINE(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b); \ + r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b); \ + return r; \ + } + +NPYV_IMPL_RVV_COMBINE(npyv_u8, u8) +NPYV_IMPL_RVV_COMBINE(npyv_s8, s8) +NPYV_IMPL_RVV_COMBINE(npyv_u16, u16) +NPYV_IMPL_RVV_COMBINE(npyv_s16, s16) +NPYV_IMPL_RVV_COMBINE(npyv_u32, u32) +NPYV_IMPL_RVV_COMBINE(npyv_s32, s32) +NPYV_IMPL_RVV_COMBINE(npyv_u64, u64) +NPYV_IMPL_RVV_COMBINE(npyv_s64, s64) +NPYV_IMPL_RVV_COMBINE(npyv_f32, f32) +NPYV_IMPL_RVV_COMBINE(npyv_f64, f64) + +// interleave & deinterleave two vectors +NPY_FINLINE npyv_u8 vzip1q_u8(npyv_u8 a, npyv_u8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 interleave_idx = __riscv_vsrl_vx_u8m1(index, 1, vl); + + npyv_u8 a_gathered = __riscv_vrgather_vv_u8m1(a, interleave_idx, vl); + npyv_u8 b_gathered = __riscv_vrgather_vv_u8m1(b, interleave_idx, vl); + + vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( + __riscv_vand_vx_u8m1(index, 1, vl), + 0, + vl + ); + + return __riscv_vmerge_vvm_u8m1(a_gathered, b_gathered, mask, vl); +} + +NPY_FINLINE npyv_u8 vzip2q_u8(npyv_u8 a, npyv_u8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 interleave_idx = __riscv_vadd_vx_u8m1( + __riscv_vsrl_vx_u8m1(index, 1, vl), + 8, + vl + ); + + npyv_u8 a_gathered = __riscv_vrgather_vv_u8m1(a, interleave_idx, vl); + npyv_u8 b_gathered = __riscv_vrgather_vv_u8m1(b, interleave_idx, vl); + + vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( + __riscv_vand_vx_u8m1(index, 1, vl), + 0, + vl + ); + + return __riscv_vmerge_vvm_u8m1(a_gathered, b_gathered, mask, vl); +} + +NPY_FINLINE npyv_u8 vuzp1q_u8(npyv_u8 a, npyv_u8 b) { 
+ size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 gather_idx_a = __riscv_vmul_vx_u8m1(index, 2, vl); + npyv_u8 gather_idx_b = gather_idx_a; + + vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); + + gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); + + npyv_u8 result_a = __riscv_vrgather_vv_u8m1(a, gather_idx_a, vl); + npyv_u8 result_b = __riscv_vrgather_vv_u8m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_u8m1(result_a, result_b, high_mask, vl); +} + +NPY_FINLINE npyv_u8 vuzp2q_u8(npyv_u8 a, npyv_u8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 gather_idx_a = __riscv_vadd_vx_u8m1( + __riscv_vmul_vx_u8m1(index, 2, vl), + 1, + vl + ); + + npyv_u8 gather_idx_b = gather_idx_a; + + vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); + + gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); + + npyv_u8 result_a = __riscv_vrgather_vv_u8m1(a, gather_idx_a, vl); + npyv_u8 result_b = __riscv_vrgather_vv_u8m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_u8m1(result_a, result_b, high_mask, vl); +} + +NPY_FINLINE npyv_s8 vzip1q_s8(npyv_s8 a, npyv_s8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 interleave_idx = __riscv_vsrl_vx_u8m1(index, 1, vl); + + npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, interleave_idx, vl); + npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, interleave_idx, vl); + + vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( + __riscv_vand_vx_u8m1(index, 1, vl), + 0, + vl + ); + + return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, mask, vl); +} + +NPY_FINLINE npyv_s8 vzip2q_s8(npyv_s8 a, npyv_s8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 interleave_idx = __riscv_vadd_vx_u8m1( + __riscv_vsrl_vx_u8m1(index, 1, vl), + 8, + vl + ); + + npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, interleave_idx, vl); + npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, interleave_idx, vl); + + vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( + __riscv_vand_vx_u8m1(index, 1, vl), + 0, + vl + ); + + return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, mask, vl); +} + +NPY_FINLINE npyv_s8 vuzp1q_s8(npyv_s8 a, npyv_s8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 gather_idx_a = __riscv_vmul_vx_u8m1(index, 2, vl); + npyv_u8 gather_idx_b = gather_idx_a; + + vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); + + gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); + + npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, gather_idx_a, vl); + npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_s8 vuzp2q_s8(npyv_s8 a, npyv_s8 b) +{ + size_t vl = __riscv_vsetvlmax_e8m1(); + npyv_u8 index = __riscv_vid_v_u8m1(vl); + + npyv_u8 gather_idx_a = __riscv_vadd_vx_u8m1( + __riscv_vmul_vx_u8m1(index, 2, vl), + 1, + vl + ); + + npyv_u8 gather_idx_b = gather_idx_a; + + vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); + + gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); + + npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, gather_idx_a, vl); + npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_u16 vzip1q_u16(npyv_u16 a, npyv_u16 b) +{ + size_t vl = 
__riscv_vsetvlmax_e16m1(); + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 interleave_idx = __riscv_vsrl_vx_u16m1(index, 1, vl); + + npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, interleave_idx, vl); + npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, interleave_idx, vl); + + vbool16_t mask = __riscv_vmsne_vx_u16m1_b16( + __riscv_vand_vx_u16m1(index, 1, vl), + 0, + vl + ); + + return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, mask, vl); +} + +NPY_FINLINE npyv_u16 vzip2q_u16(npyv_u16 a, npyv_u16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 interleave_idx = __riscv_vadd_vx_u16m1( + __riscv_vsrl_vx_u16m1(index, 1, vl), + 4, + vl + ); + + npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, interleave_idx, vl); + npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, interleave_idx, vl); + + vbool16_t mask = __riscv_vmsne_vx_u16m1_b16( + __riscv_vand_vx_u16m1(index, 1, vl), + 0, + vl + ); + + return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, mask, vl); +} + +NPY_FINLINE npyv_u16 vuzp1q_u16(npyv_u16 a, npyv_u16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 gather_idx_a = __riscv_vmul_vx_u16m1(index, 2, vl); + + npyv_u16 gather_idx_b = __riscv_vmul_vx_u16m1( + __riscv_vsub_vx_u16m1(index, 4, vl), + 2, + vl + ); + + npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); + vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); + + npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, gather_idx_a, vl); + npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_u16 vuzp2q_u16(npyv_u16 a, npyv_u16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 gather_idx_a = __riscv_vadd_vx_u16m1( + __riscv_vmul_vx_u16m1(index, 2, vl), + 1, + vl + ); + + npyv_u16 gather_idx_b = __riscv_vadd_vx_u16m1( + __riscv_vmul_vx_u16m1( + __riscv_vsub_vx_u16m1(index, 4, vl), + 2, + vl + ), + 1, + vl + ); + + npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); + vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); + + npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, gather_idx_a, vl); + npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_s16 vzip1q_s16(npyv_s16 a, npyv_s16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 gather_idx = __riscv_vsrl_vx_u16m1(index, 1, vl); + + vbool16_t sel_mask = __riscv_vmsne_vx_u16m1_b16( + __riscv_vand_vx_u16m1(index, 1, vl), + 0, + vl + ); + + npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx, vl); + npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_s16 vzip2q_s16(npyv_s16 a, npyv_s16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 gather_idx = __riscv_vadd_vx_u16m1( + __riscv_vsrl_vx_u16m1(index, 1, vl), + 4, + vl + ); + + vbool16_t sel_mask = __riscv_vmsne_vx_u16m1_b16( + __riscv_vand_vx_u16m1(index, 1, vl), + 0, + vl + ); + + npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx, vl); + npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx, vl); + + return 
__riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_s16 vuzp1q_s16(npyv_s16 a, npyv_s16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 gather_idx_a = __riscv_vmul_vx_u16m1(index, 2, vl); + + npyv_u16 gather_idx_b = __riscv_vmul_vx_u16m1( + __riscv_vsub_vx_u16m1(index, 4, vl), + 2, + vl + ); + + npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); + vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); + + npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx_a, vl); + npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_s16 vuzp2q_s16(npyv_s16 a, npyv_s16 b) +{ + size_t vl = __riscv_vsetvlmax_e16m1(); + + npyv_u16 index = __riscv_vid_v_u16m1(vl); + + npyv_u16 gather_idx_a = __riscv_vadd_vx_u16m1( + __riscv_vmul_vx_u16m1(index, 2, vl), + 1, + vl + ); + + npyv_u16 gather_idx_b = __riscv_vadd_vx_u16m1( + __riscv_vmul_vx_u16m1( + __riscv_vsub_vx_u16m1(index, 4, vl), + 2, + vl + ), + 1, + vl + ); + + npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); + vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); + + npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx_a, vl); + npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_u32 vzip1q_u32(npyv_u32 a, npyv_u32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx = __riscv_vsrl_vx_u32m1(index, 1, vl); + + vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( + __riscv_vand_vx_u32m1(index, 1, vl), + 0, + vl + ); + + npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx, vl); + npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_u32 vzip2q_u32(npyv_u32 a, npyv_u32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx = __riscv_vadd_vx_u32m1( + __riscv_vsrl_vx_u32m1(index, 1, vl), + 2, + vl + ); + + vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( + __riscv_vand_vx_u32m1(index, 1, vl), + 0, + vl + ); + + npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx, vl); + npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_u32 vuzp1q_u32(npyv_u32 a, npyv_u32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx_a = __riscv_vmul_vx_u32m1(index, 2, vl); + + npyv_u32 gather_idx_b = __riscv_vmul_vx_u32m1( + __riscv_vsub_vx_u32m1(index, 2, vl), + 2, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); + + npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx_a, vl); + npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_u32 vuzp2q_u32(npyv_u32 a, npyv_u32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx_a = __riscv_vadd_vx_u32m1( + 
__riscv_vmul_vx_u32m1(index, 2, vl), + 1, + vl + ); + + npyv_u32 gather_idx_b = __riscv_vadd_vx_u32m1( + __riscv_vmul_vx_u32m1( + __riscv_vsub_vx_u32m1(index, 2, vl), + 2, + vl + ), + 1, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); + + npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx_a, vl); + npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_s32 vzip1q_s32(npyv_s32 a, npyv_s32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx = __riscv_vsrl_vx_u32m1(index, 1, vl); + + vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( + __riscv_vand_vx_u32m1(index, 1, vl), + 0, + vl + ); + + npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx, vl); + npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_s32 vzip2q_s32(npyv_s32 a, npyv_s32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx = __riscv_vadd_vx_u32m1( + __riscv_vsrl_vx_u32m1(index, 1, vl), + 2, + vl + ); + + vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( + __riscv_vand_vx_u32m1(index, 1, vl), + 0, + vl + ); + + npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx, vl); + npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_s32 vuzp1q_s32(npyv_s32 a, npyv_s32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx_a = __riscv_vmul_vx_u32m1(index, 2, vl); + + npyv_u32 gather_idx_b = __riscv_vmul_vx_u32m1( + __riscv_vsub_vx_u32m1(index, 2, vl), + 2, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); + + npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx_a, vl); + npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_s32 vuzp2q_s32(npyv_s32 a, npyv_s32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx_a = __riscv_vadd_vx_u32m1( + __riscv_vmul_vx_u32m1(index, 2, vl), + 1, + vl + ); + + npyv_u32 gather_idx_b = __riscv_vadd_vx_u32m1( + __riscv_vmul_vx_u32m1( + __riscv_vsub_vx_u32m1(index, 2, vl), + 2, + vl + ), + 1, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); + + npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx_a, vl); + npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_f32 vzip1q_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx = __riscv_vsrl_vx_u32m1(index, 1, vl); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t sel_mask = __riscv_vmsgt_vx_i32m1_b32( + __riscv_vand_vx_i32m1(signed_index, 1, vl), + 0, + vl + ); + + npyv_f32 
a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx, vl); + npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_f32 vzip2q_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx = __riscv_vadd_vx_u32m1( + __riscv_vsrl_vx_u32m1(index, 1, vl), + 2, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t sel_mask = __riscv_vmsgt_vx_i32m1_b32( + __riscv_vand_vx_i32m1(signed_index, 1, vl), + 0, + vl + ); + + npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx, vl); + npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx, vl); + + return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, sel_mask, vl); +} + +NPY_FINLINE npyv_f32 vuzp1q_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx_a = __riscv_vmul_vx_u32m1(index, 2, vl); + + npyv_u32 gather_idx_b = __riscv_vmul_vx_u32m1( + __riscv_vsub_vx_u32m1(index, 2, vl), + 2, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); + + npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx_a, vl); + npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, high_mask, vl); +} + +NPY_FINLINE npyv_f32 vuzp2q_f32(npyv_f32 a, npyv_f32 b) +{ + size_t vl = __riscv_vsetvlmax_e32m1(); + + npyv_u32 index = __riscv_vid_v_u32m1(vl); + + npyv_u32 gather_idx_a = __riscv_vadd_vx_u32m1( + __riscv_vmul_vx_u32m1(index, 2, vl), + 1, + vl + ); + + npyv_u32 gather_idx_b = __riscv_vadd_vx_u32m1( + __riscv_vmul_vx_u32m1( + __riscv_vsub_vx_u32m1(index, 2, vl), + 2, + vl + ), + 1, + vl + ); + + npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); + vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); + + npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx_a, vl); + npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx_b, vl); + + return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, high_mask, vl); +} + +// interleave & deinterleave two vectors +#define NPYV_IMPL_RVV_ZIP(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vzip1q_##SFX(a, b); \ + r.val[1] = vzip2q_##SFX(a, b); \ + return r; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + T_VEC##x2 r; \ + r.val[0] = vuzp1q_##SFX(a, b); \ + r.val[1] = vuzp2q_##SFX(a, b); \ + return r; \ + } + +NPYV_IMPL_RVV_ZIP(npyv_u8, u8) +NPYV_IMPL_RVV_ZIP(npyv_s8, s8) +NPYV_IMPL_RVV_ZIP(npyv_u16, u16) +NPYV_IMPL_RVV_ZIP(npyv_s16, s16) +NPYV_IMPL_RVV_ZIP(npyv_u32, u32) +NPYV_IMPL_RVV_ZIP(npyv_s32, s32) +NPYV_IMPL_RVV_ZIP(npyv_f32, f32) + +#define NPYV_IMPL_RVV_ZIP2(SFX) \ + NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX a, npyv_##SFX b) \ + { \ + npyv_##SFX##x2 r; \ + r.val[0] = npyv_combinel_##SFX(a, b); \ + r.val[1] = npyv_combineh_##SFX(a, b); \ + return r; \ + } \ + \ + NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX a, npyv_##SFX b) \ + { \ + npyv_##SFX##x2 r; \ + r.val[0] = npyv_combinel_##SFX(a, b); \ + r.val[1] = npyv_combineh_##SFX(a, b); \ + return r; \ + } + +NPYV_IMPL_RVV_ZIP2(u64) +NPYV_IMPL_RVV_ZIP2(s64) +NPYV_IMPL_RVV_ZIP2(f64) + +// Reverse elements of each 64-bit 
lane +NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) +{ + npyv_u8 vid = __riscv_vid_v_u8m1(8); + npyv_u8 vid_slideup = __riscv_vslideup_vx_u8m1(vid, vid, 8, 16); + npyv_u8 sub = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(7, 16), __riscv_vmv_v_x_u8m1(7 + 8, 16), 8, 16); + npyv_u8 idxs = __riscv_vsub_vv_u8m1(sub, vid_slideup, 16); + return __riscv_vrgather_vv_u8m1(a, idxs, 16); +} + +NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a) { + npyv_u8 vid = __riscv_vid_v_u8m1(8); + npyv_u8 vid_slideup = __riscv_vslideup_vx_u8m1(vid, vid, 8, 16); + npyv_u8 sub = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(7,16), __riscv_vmv_v_x_u8m1(7 + 8, 16), 8, 16); + npyv_u8 idxs = __riscv_vsub_vv_u8m1(sub, vid_slideup, 16); + return __riscv_vrgather_vv_i8m1(a, idxs, 16); +} + +NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) { + npyv_u16 vid = __riscv_vid_v_u16m1(4); + npyv_u16 vid_slideup = __riscv_vslideup_vx_u16m1(vid, vid, 4, 8); + npyv_u16 sub = __riscv_vslideup_vx_u16m1(__riscv_vmv_v_x_u16m1(3, 8), __riscv_vmv_v_x_u16m1(3 + 4, 8), 4, 8); + npyv_u16 idxs = __riscv_vsub_vv_u16m1(sub, vid_slideup, 8); + return __riscv_vrgather_vv_u16m1(a, idxs, 8); +} + +NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a) { + npyv_u16 vid = __riscv_vid_v_u16m1(4); + npyv_u16 vid_slideup = __riscv_vslideup_vx_u16m1(vid, vid, 4, 8); + npyv_u16 sub = __riscv_vslideup_vx_u16m1(__riscv_vmv_v_x_u16m1(3, 8), __riscv_vmv_v_x_u16m1(3 + 4, 8), 4, 8); + npyv_u16 idxs = __riscv_vsub_vv_u16m1(sub, vid_slideup, 8); + return __riscv_vrgather_vv_i16m1(a, idxs, 8); +} + +NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a) { + npyv_u32 vid = __riscv_vid_v_u32m1(2); + npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); + npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); + npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); + return __riscv_vrgather_vv_u32m1(a, idxs, 4); +} + +NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) { + npyv_u32 vid = __riscv_vid_v_u32m1(2); + npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); + npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); + npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); + return __riscv_vrgather_vv_i32m1(a, idxs, 4); +} + +NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) { + npyv_u32 vid = __riscv_vid_v_u32m1(2); + npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); + npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); + npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); + return __riscv_vrgather_vv_f32m1(a, idxs, 4); +} + +// Permuting the elements of each 128-bit lane by immediate index for +// each element. 
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + npyv_set_u32( \ + __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E0, 4)), __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E1, 4)), \ + __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E2, 4)), __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E3, 4)) \ + ) +#define npyv_permi128_s32(A, E0, E1, E2, E3) \ + npyv_set_s32( \ + __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E0, 4)), __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E1, 4)), \ + __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E2, 4)), __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E3, 4)) \ + ) +#define npyv_permi128_f32(A, E0, E1, E2, E3) \ + npyv_set_f32( \ + __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E0, 4)), __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E1, 4)), \ + __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E2, 4)), __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E3, 4)) \ + ) + +#define npyv_permi128_u64(A, E0, E1) \ + npyv_set_u64( \ + __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(A, E0, 2)), __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(A, E1, 2)) \ + ) +#define npyv_permi128_s64(A, E0, E1) \ + npyv_set_s64( \ + __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(A, E0, 2)), __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(A, E1, 2)) \ + ) +#define npyv_permi128_f64(A, E0, E1) \ + npyv_set_f64( \ + __riscv_vfmv_f_s_f64m1_f64(__riscv_vslidedown_vx_f64m1(A, E0, 2)), __riscv_vfmv_f_s_f64m1_f64(__riscv_vslidedown_vx_f64m1(A, E1, 2)) \ + ) + +#endif // _NPY_SIMD_RVV_REORDER_H diff --git a/numpy/_core/src/common/simd/rvv/rvv.h b/numpy/_core/src/common/simd/rvv/rvv.h new file mode 100644 index 000000000000..41a4d8fd42ec --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/rvv.h @@ -0,0 +1,86 @@ +#ifndef _NPY_SIMD_H_ + #error "Not a standalone header" +#endif + +#define NPY_SIMD 128 +#define NPY_SIMD_WIDTH 16 +#define NPY_SIMD_F32 0 +#define NPY_SIMD_F64 1 + +#ifdef NPY_HAVE_FMA3 + #define NPY_SIMD_FMA3 1 // native support +#else + #define NPY_SIMD_FMA3 0 // fast emulated +#endif + +#define NPY_SIMD_BIGENDIAN 0 +#define NPY_SIMD_CMPSIGNAL 1 + +typedef vuint8m1_t fixed_vuint8m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vuint16m1_t fixed_vuint16m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vuint32m1_t fixed_vuint32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vuint64m1_t fixed_vuint64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint8m1_t fixed_vint8m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint16m1_t fixed_vint16m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint32m1_t fixed_vint32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vint64m1_t fixed_vint64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vfloat32m1_t fixed_vfloat32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vfloat64m1_t fixed_vfloat64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); + +#define npyv_u8 fixed_vuint8m1_t +#define npyv_u16 fixed_vuint16m1_t +#define npyv_u32 fixed_vuint32m1_t +#define npyv_u64 fixed_vuint64m1_t +#define npyv_s8 fixed_vint8m1_t +#define npyv_s16 fixed_vint16m1_t +#define npyv_s32 fixed_vint32m1_t +#define npyv_s64 fixed_vint64m1_t +#define npyv_b8 fixed_vuint8m1_t 
+#define npyv_b16 fixed_vuint16m1_t +#define npyv_b32 fixed_vuint32m1_t +#define npyv_b64 fixed_vuint64m1_t +#define npyv_f32 fixed_vfloat32m1_t +#define npyv_f64 fixed_vfloat64m1_t + + +typedef struct { fixed_vuint8m1_t val[2]; } npyv_u8x2; +typedef struct { fixed_vint8m1_t val[2]; } npyv_s8x2; +typedef struct { fixed_vuint16m1_t val[2]; } npyv_u16x2; +typedef struct { fixed_vint16m1_t val[2]; } npyv_s16x2; +typedef struct { fixed_vuint32m1_t val[2]; } npyv_u32x2; +typedef struct { fixed_vint32m1_t val[2]; } npyv_s32x2; +typedef struct { fixed_vuint64m1_t val[2]; } npyv_u64x2; +typedef struct { fixed_vint64m1_t val[2]; } npyv_s64x2; +typedef struct { fixed_vfloat32m1_t val[2]; } npyv_f32x2; +typedef struct { fixed_vfloat64m1_t val[2]; } npyv_f64x2; + + +typedef struct { fixed_vuint8m1_t val[3]; } npyv_u8x3; +typedef struct { fixed_vint8m1_t val[3]; } npyv_s8x3; +typedef struct { fixed_vuint16m1_t val[3]; } npyv_u16x3; +typedef struct { fixed_vint16m1_t val[3]; } npyv_s16x3; +typedef struct { fixed_vuint32m1_t val[3]; } npyv_u32x3; +typedef struct { fixed_vint32m1_t val[3]; } npyv_s32x3; +typedef struct { fixed_vuint64m1_t val[3]; } npyv_u64x3; +typedef struct { fixed_vint64m1_t val[3]; } npyv_s64x3; +typedef struct { fixed_vfloat32m1_t val[3]; } npyv_f32x3; +typedef struct { fixed_vfloat64m1_t val[3]; } npyv_f64x3; + +#define npyv_nlanes_u8 16 +#define npyv_nlanes_s8 16 +#define npyv_nlanes_u16 8 +#define npyv_nlanes_s16 8 +#define npyv_nlanes_u32 4 +#define npyv_nlanes_s32 4 +#define npyv_nlanes_u64 2 +#define npyv_nlanes_s64 2 +#define npyv_nlanes_f32 4 +#define npyv_nlanes_f64 2 + +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" +#include "math.h" diff --git a/numpy/_core/src/common/simd/simd.h b/numpy/_core/src/common/simd/simd.h index fe4ca4da92f5..0bb313bb7300 100644 --- a/numpy/_core/src/common/simd/simd.h +++ b/numpy/_core/src/common/simd/simd.h @@ -91,6 +91,10 @@ typedef double npyv_lanetype_f64; #include "lsx/lsx.h" #endif +#ifdef NPY_HAVE_RVV + #include "rvv/rvv.h" +#endif + #ifndef NPY_SIMD /// SIMD width in bits or 0 if there's no SIMD extension available. #define NPY_SIMD 0 diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src index d330c21695d5..f8b44ee41909 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src @@ -36,7 +36,7 @@ * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); ********************************************************************************/ -#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) +#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) || defined(NPY_HAVE_RVV) // Due to integer 128-bit multiplication emulation, SIMD 64-bit division // may not perform well on both neon and up to VSX3 compared to scalar // division. @@ -452,7 +452,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. * Power10(VSX4) is an exception here since it has native support for integer vector division. 
*/ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)) +#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) || defined(NPY_HAVE_RVV)) #undef TO_SIMD_SFX #endif NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) diff --git a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src index e12a98864d96..885ee13c7b86 100755 --- a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src +++ b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src @@ -152,11 +152,19 @@ store_vector(vtype vec, type_t* dst, npy_intp sdst, npy_intp len){ #if NPY_SIMD_F64 [[maybe_unused]] HWY_ATTR NPY_FINLINE vec_f64 lut_16_f64(const double * lut, vec_u64 idx){ +#if defined(NPY_HAVE_RVV) + if (hn::MaxLanes(f64) == 8){ +#else if constexpr(hn::MaxLanes(f64) == 8){ +#endif const vec_f64 lut0 = hn::Load(f64, lut); const vec_f64 lut1 = hn::Load(f64, lut + 8); return hn::TwoTablesLookupLanes(f64, lut0, lut1, hn::IndicesFromVec(f64, idx)); +#if defined(NPY_HAVE_RVV) + }else if (hn::MaxLanes(f64) == 4){ +#else }else if constexpr (hn::MaxLanes(f64) == 4){ +#endif const vec_f64 lut0 = hn::Load(f64, lut); const vec_f64 lut1 = hn::Load(f64, lut + 4); const vec_f64 lut2 = hn::Load(f64, lut + 8); @@ -371,6 +379,9 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ const int nlanes = hn::Lanes(f64); const vec_f64 qnan = hn::Set(f64, NPY_NAN); +#if defined(NPY_HAVE_RVV) + vec_f64 vec0,vec1; +#endif for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) { vec_f64 x = load_vector(src, ssrc, len); @@ -391,6 +402,22 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ // transpose the coef. into lanes. 2 lane transpose is all that's // implemented so we require `npyv_nlanes_f64` == 2. 
vec_f64 b, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16; +#if defined(NPY_HAVE_RVV) + if (hn::MaxLanes(f64) == 2){ + uint64_t index[2]; + hn::StoreU(idx, u64, index); + + /**begin repeat + * #off= 0, 2, 4, 6, 8, 10, 12, 14, 16# + * #e0 = b, c1, c3, c5, c7, c9, c11, c13, c15# + * #e1 = c0,c2, c4, c6, c8, c10,c12, c14, c16# + */ + vec0 = hn::LoadU(f64, (const double*)lut18x16 + index[0] * 18 + @off@); + vec1 = hn::LoadU(f64, (const double*)lut18x16 + index[1] * 18 + @off@); + @e0@ = hn::ConcatLowerLower(f64, vec1, vec0); + @e1@ = hn::ConcatUpperUpper(f64, vec1, vec0); + /**end repeat**/ +#else if constexpr(hn::MaxLanes(f64) == 2){ vec_f64 e0e1_0, e0e1_1; uint64_t index[hn::Lanes(f64)]; @@ -406,6 +433,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ @e0@ = hn::ConcatLowerLower(f64, e0e1_1, e0e1_0); @e1@ = hn::ConcatUpperUpper(f64, e0e1_1, e0e1_0); /**end repeat**/ +#endif } else { b = lut_16_f64((const double*)lut16x18 + 16*0, idx); c0 = lut_16_f64((const double*)lut16x18 + 1*16, idx); diff --git a/numpy/_core/tests/test_cpu_dispatcher.py b/numpy/_core/tests/test_cpu_dispatcher.py index c52cd418a08b..38ff010c5144 100644 --- a/numpy/_core/tests/test_cpu_dispatcher.py +++ b/numpy/_core/tests/test_cpu_dispatcher.py @@ -12,7 +12,7 @@ def test_dispatcher(): "SSE2", "SSE41", "AVX2", "VSX", "VSX2", "VSX3", "NEON", "ASIMD", "ASIMDHP", - "VX", "VXE", "LSX" + "VX", "VXE", "LSX", "RVV" ) highest_sfx = "" # no suffix for the baseline all_sfx = [] diff --git a/numpy/_core/tests/test_cpu_features.py b/numpy/_core/tests/test_cpu_features.py index 0797bb8c772c..722000561a78 100644 --- a/numpy/_core/tests/test_cpu_features.py +++ b/numpy/_core/tests/test_cpu_features.py @@ -432,3 +432,12 @@ class Test_LOONGARCH_Features(AbstractTest): def load_flags(self): self.load_flags_cpuinfo("Features") + + +is_riscv = re.match("^(riscv64)", machine, re.IGNORECASE) +@pytest.mark.skipif(not is_linux or not is_riscv, reason="Only for Linux and RISC-V") +class Test_RISCV_Features(AbstractTest): + features = ["RVV"] + + def load_flags(self): + self.load_flags_cpuinfo("Features") \ No newline at end of file From cea06b42066bae9a06d85b105e950c5cb5e26fe1 Mon Sep 17 00:00:00 2001 From: amane-ame Date: Fri, 11 Apr 2025 12:46:31 +0800 Subject: [PATCH 2/3] Rewrite rvv backend. 
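
The rewrite parameterizes the intrinsics on the npyv_nlanes_* constants
instead of hard-coded 16/8/4/2 lane counts, switches the build to
-march=rv64gcv_zvl256b with -mrvv-vector-bits=256, and implements
npyv_divc_* directly on __riscv_vmulh/__riscv_vmulhu plus shifts, following
the identities documented in simd/intdiv.h:

    floor(a/d) = (mulhi + ((a - mulhi) >> sh1)) >> sh2        (unsigned)
    q = ((a + mulhi) >> sh1) - XSIGN(a)
    trunc(a/d) = (q ^ dsign) - dsign                           (signed)

For reference, a scalar sketch of the unsigned path; the helper names
divc_precompute_u32/divc_eval_u32 are illustrative only, and the actual
precomputation of (m, sh1, sh2) == divisor.val[0..2] lives in
simd/intdiv.h, so details there may differ:

    #include <stdint.h>

    typedef struct { uint32_t m, sh1, sh2; } divc_u32;

    /* round-up method, d != 0:
       l = ceil(log2(d)), m = floor(2^32 * (2^l - d) / d) + 1 */
    static divc_u32 divc_precompute_u32(uint32_t d)
    {
        uint32_t l = 0;
        while ((1ull << l) < d)
            ++l;
        divc_u32 r;
        r.m   = (uint32_t)((((1ull << l) - d) << 32) / d) + 1;
        r.sh1 = l < 1 ? l : 1;      /* min(l, 1)     */
        r.sh2 = l > 0 ? l - 1 : 0;  /* max(l - 1, 0) */
        return r;
    }

    /* floor(a/d) = (mulhi + ((a - mulhi) >> sh1)) >> sh2 */
    static uint32_t divc_eval_u32(uint32_t a, divc_u32 dv)
    {
        uint32_t mulhi = (uint32_t)(((uint64_t)a * dv.m) >> 32);
        return (mulhi + ((a - mulhi) >> dv.sh1)) >> dv.sh2;
    }

    /* e.g. divc_eval_u32(100, divc_precompute_u32(7)) == 14 */

The vector code evaluates the same sequence across all lanes with
__riscv_vmulhu, __riscv_vsub, __riscv_vsrl and __riscv_vadd.
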
--- meson.options | 2 +- numpy/_core/meson.build | 3 +- numpy/_core/src/common/simd/rvv/arithmetic.h | 438 +++---- numpy/_core/src/common/simd/rvv/conversion.h | 164 ++- numpy/_core/src/common/simd/rvv/math.h | 634 +++-------- numpy/_core/src/common/simd/rvv/memory.h | 751 ++++-------- numpy/_core/src/common/simd/rvv/misc.h | 780 ++++--------- numpy/_core/src/common/simd/rvv/operators.h | 936 ++++----------- numpy/_core/src/common/simd/rvv/reorder.h | 1002 +++-------------- numpy/_core/src/common/simd/rvv/rvv.h | 50 +- .../umath/loops_unary_fp_le.dispatch.c.src | 22 +- numpy/_core/tests/test_cpu_features.py | 9 - 12 files changed, 1154 insertions(+), 3637 deletions(-) diff --git a/meson.options b/meson.options index 7b86854e9882..4989828d22e1 100644 --- a/meson.options +++ b/meson.options @@ -35,7 +35,7 @@ option('test-simd', type: 'array', 'VSX', 'VSX2', 'VSX3', 'VSX4', 'NEON', 'ASIMD', 'VX', 'VXE', 'VXE2', - 'LSX','RVV', + 'LSX', 'RVV', ], description: 'Specify a list of CPU features to be tested against NumPy SIMD interface') option('test-simd-args', type: 'string', value: '', diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index 7b74abcfa578..229bc5cfce47 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -104,7 +104,7 @@ if host_machine.cpu_family() == 'loongarch64' endif if host_machine.cpu_family() == 'riscv64' - add_project_arguments('-march=rv64gcv_zvl128b', '-mrvv-vector-bits=zvl', language: ['c','cpp']) + add_project_arguments('-march=rv64gcv_zvl256b', '-mrvv-vector-bits=256', language: ['c','cpp']) endif use_highway = not get_option('disable-highway') @@ -946,6 +946,7 @@ foreach gen_mtargets : [ NEON_VFPV4, VXE, LSX, + RVV, ] ], [ diff --git a/numpy/_core/src/common/simd/rvv/arithmetic.h b/numpy/_core/src/common/simd/rvv/arithmetic.h index a301bc7b258e..ca924e18fedd 100644 --- a/numpy/_core/src/common/simd/rvv/arithmetic.h +++ b/numpy/_core/src/common/simd/rvv/arithmetic.h @@ -5,406 +5,222 @@ #ifndef _NPY_SIMD_RVV_ARITHMETIC_H #define _NPY_SIMD_RVV_ARITHMETIC_H -#include - /*************************** * Addition ***************************/ // non-saturated -NPY_FINLINE npyv_u8 npyv_add_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vadd_vv_u8m1(a, b, 16); } -NPY_FINLINE npyv_s8 npyv_add_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vadd_vv_i8m1(a, b, 16); } -NPY_FINLINE npyv_u16 npyv_add_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vadd_vv_u16m1(a, b, 8); } -NPY_FINLINE npyv_s16 npyv_add_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vadd_vv_i16m1(a, b, 8); } -NPY_FINLINE npyv_u32 npyv_add_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vadd_vv_u32m1(a, b, 4); } -NPY_FINLINE npyv_s32 npyv_add_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vadd_vv_i32m1(a, b, 4); } -NPY_FINLINE npyv_u64 npyv_add_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vadd_vv_u64m1(a, b, 2); } -NPY_FINLINE npyv_s64 npyv_add_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vadd_vv_i64m1(a, b, 2); } -NPY_FINLINE npyv_f32 npyv_add_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfadd_vv_f32m1(a, b, 4); } -NPY_FINLINE npyv_f64 npyv_add_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfadd_vv_f64m1(a, b, 2); } +NPY_FINLINE npyv_u8 npyv_add_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vadd_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_add_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vadd_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_add_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vadd_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_add_s16(npyv_s16 a, npyv_s16 b) { 
return __riscv_vadd_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_add_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vadd_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_add_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vadd_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_add_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vadd_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_add_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vadd_vv_i64m1(a, b, npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_add_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfadd_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_add_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfadd_vv_f64m1(a, b, npyv_nlanes_f64); } // saturated -NPY_FINLINE npyv_u8 npyv_adds_u8(npyv_u8 a, npyv_u8 b) { - return __riscv_vsaddu_vv_u8m1(a, b, 16); -} - -NPY_FINLINE npyv_s8 npyv_adds_s8(npyv_s8 a, npyv_s8 b) { - return __riscv_vsadd_vv_i8m1(a, b, 16); -} - -NPY_FINLINE npyv_u16 npyv_adds_u16(npyv_u16 a, npyv_u16 b) { - return __riscv_vsaddu_vv_u16m1(a, b, 8); -} - -NPY_FINLINE npyv_s16 npyv_adds_s16(npyv_s16 a, npyv_s16 b) { - return __riscv_vsadd_vv_i16m1(a, b, 8); -} +NPY_FINLINE npyv_u8 npyv_adds_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vsaddu_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_adds_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vsadd_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_adds_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vsaddu_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_adds_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vsadd_vv_i16m1(a, b, npyv_nlanes_s16); } /*************************** * Subtraction ***************************/ // non-saturated -NPY_FINLINE npyv_u8 npyv_sub_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vsub_vv_u8m1(a, b, 16); } -NPY_FINLINE npyv_s8 npyv_sub_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vsub_vv_i8m1(a, b, 16); } -NPY_FINLINE npyv_u16 npyv_sub_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vsub_vv_u16m1(a, b, 8); } -NPY_FINLINE npyv_s16 npyv_sub_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vsub_vv_i16m1(a, b, 8); } -NPY_FINLINE npyv_u32 npyv_sub_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vsub_vv_u32m1(a, b, 4); } -NPY_FINLINE npyv_s32 npyv_sub_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vsub_vv_i32m1(a, b, 4); } -NPY_FINLINE npyv_u64 npyv_sub_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vsub_vv_u64m1(a, b, 2); } -NPY_FINLINE npyv_s64 npyv_sub_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vsub_vv_i64m1(a, b, 2); } -NPY_FINLINE npyv_f32 npyv_sub_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfsub_vv_f32m1(a, b, 4); } -NPY_FINLINE npyv_f64 npyv_sub_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfsub_vv_f64m1(a, b, 2); } +NPY_FINLINE npyv_u8 npyv_sub_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vsub_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_sub_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vsub_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_sub_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vsub_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_sub_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vsub_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_sub_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vsub_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_sub_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vsub_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_sub_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vsub_vv_u64m1(a, b, 
npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_sub_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vsub_vv_i64m1(a, b, npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_sub_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfsub_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_sub_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfsub_vv_f64m1(a, b, npyv_nlanes_f64); } // saturated -NPY_FINLINE npyv_u8 npyv_subs_u8(npyv_u8 a, npyv_u8 b) { - return __riscv_vssubu_vv_u8m1(a, b, 16); -} - -NPY_FINLINE npyv_s8 npyv_subs_s8(npyv_s8 a, npyv_s8 b) { - return __riscv_vssub_vv_i8m1(a, b, 16); -} - -NPY_FINLINE npyv_u16 npyv_subs_u16(npyv_u16 a, npyv_u16 b) { - return __riscv_vssubu_vv_u16m1(a, b, 8); -} - -NPY_FINLINE npyv_s16 npyv_subs_s16(npyv_s16 a, npyv_s16 b) { - return __riscv_vssub_vv_i16m1(a, b, 8); -} +NPY_FINLINE npyv_u8 npyv_subs_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vssubu_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_subs_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vssub_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_subs_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vssubu_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_subs_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vssub_vv_i16m1(a, b, npyv_nlanes_s16); } /*************************** * Multiplication ***************************/ // non-saturated -NPY_FINLINE npyv_u8 npyv_mul_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vmul_vv_u8m1(a, b, 16); } -NPY_FINLINE npyv_s8 npyv_mul_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vmul_vv_i8m1(a, b, 16); } -NPY_FINLINE npyv_u16 npyv_mul_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vmul_vv_u16m1(a, b, 8); } -NPY_FINLINE npyv_s16 npyv_mul_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vmul_vv_i16m1(a, b, 8); } -NPY_FINLINE npyv_u32 npyv_mul_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vmul_vv_u32m1(a, b, 4); } -NPY_FINLINE npyv_s32 npyv_mul_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vmul_vv_i32m1(a, b, 4); } -NPY_FINLINE npyv_f32 npyv_mul_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfmul_vv_f32m1(a, b, 4); } -NPY_FINLINE npyv_f64 npyv_mul_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfmul_vv_f64m1(a, b, 2); } +NPY_FINLINE npyv_u8 npyv_mul_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vmul_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_mul_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vmul_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_mul_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vmul_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_mul_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vmul_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_mul_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vmul_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_mul_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vmul_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_f32 npyv_mul_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfmul_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_mul_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfmul_vv_f64m1(a, b, npyv_nlanes_f64); } /*************************** * Integer Division ***************************/ // See simd/intdiv.h for more clarification // divide each unsigned 8-bit element by a precomputed divisor -NPY_FINLINE npyv_s16 vmull_high_s8(npyv_s8 a, npyv_s8 b) { - vint8mf2_t a_high = __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(a, 8, 16)); - vint8mf2_t b_high = __riscv_vlmul_trunc_v_i8m1_i8mf2(__riscv_vslidedown_vx_i8m1(b, 8, 16)); - return 
__riscv_vwmul_vv_i16m1(a_high, b_high, 8); -} - -NPY_FINLINE npyv_s32 vmull_high_s16(npyv_s16 a, npyv_s16 b) { - vint16mf2_t a_high = __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(a, 4, 8)); - vint16mf2_t b_high = __riscv_vlmul_trunc_v_i16m1_i16mf2(__riscv_vslidedown_vx_i16m1(b, 4, 8)); - return __riscv_vwmul_vv_i32m1(a_high, b_high, 4); -} - -NPY_FINLINE npyv_s64 vmull_high_s32(npyv_s32 a, npyv_s32 b) { - vint32mf2_t a_high = __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(a, 2, 4)); - vint32mf2_t b_high = __riscv_vlmul_trunc_v_i32m1_i32mf2(__riscv_vslidedown_vx_i32m1(b, 2, 4)); - return __riscv_vwmul_vv_i64m1(a_high, b_high, 2); -} - -NPY_FINLINE npyv_u16 vmull_high_u8(npyv_u8 a, npyv_u8 b) { - vuint8mf2_t a_high = __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(a, 8, 16)); - vuint8mf2_t b_high = __riscv_vlmul_trunc_v_u8m1_u8mf2(__riscv_vslidedown_vx_u8m1(b, 8, 16)); - return __riscv_vwmulu_vv_u16m1(a_high, b_high, 8); -} - -NPY_FINLINE npyv_u32 vmull_high_u16(npyv_u16 a, npyv_u16 b) { - vuint16mf2_t a_high = __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(a, 4, 8)); - vuint16mf2_t b_high = __riscv_vlmul_trunc_v_u16m1_u16mf2(__riscv_vslidedown_vx_u16m1(b, 4, 8)); - return __riscv_vwmulu_vv_u32m1(a_high, b_high, 4); -} - -NPY_FINLINE npyv_u64 vmull_high_u32(npyv_u32 a, npyv_u32 b) { - vuint32mf2_t a_high = __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(a, 2, 4)); - vuint32mf2_t b_high = __riscv_vlmul_trunc_v_u32m1_u32mf2(__riscv_vslidedown_vx_u32m1(b, 2, 4)); - return __riscv_vwmulu_vv_u64m1(a_high, b_high, 2); -} - -NPY_FINLINE npyv_u8 vshlq_u8(npyv_u8 a, npyv_s8 b) { - // implementation only works within defined range 'b' in [0, 7] - vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(b, 0, 16); - npyv_u8 shl = __riscv_vsll_vv_u8m1(a, __riscv_vreinterpret_v_i8m1_u8m1(b), 16); - vuint16m2_t a_ext = __riscv_vzext_vf2_u16m2(a, 16); - npyv_s8 b_neg = __riscv_vneg_v_i8m1(b, 16); - npyv_u8 shr = __riscv_vnclipu_wv_u8m1(a_ext, __riscv_vreinterpret_v_i8m1_u8m1(b_neg), __RISCV_VXRM_RDN, 16); - return __riscv_vmerge_vvm_u8m1(shr, shl, positive_mask, 16); -} - NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) { - const npyv_u8 mulc_lo = divisor.val[0]; // high part of unsigned multiplication - npyv_u16 mull_lo = __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vwmulu_vv_u16m2(a, mulc_lo, 8)); - npyv_u16 mull_hi = vmull_high_u8(a, divisor.val[0]); - // get the high unsigned bytes - npyv_u8 mulhi = vuzp2q_u8(__riscv_vreinterpret_v_u16m1_u8m1(mull_lo), __riscv_vreinterpret_v_u16m1_u8m1(mull_hi)); - - // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 - npyv_u8 q = __riscv_vsub_vv_u8m1(a, mulhi, 16); - q = vshlq_u8(q, __riscv_vreinterpret_v_u8m1_i8m1(divisor.val[1])); - q = __riscv_vadd_vv_u8m1(mulhi, q, 16); - q = vshlq_u8(q, __riscv_vreinterpret_v_u8m1_i8m1(divisor.val[2])); - return q; -} - -NPY_FINLINE npyv_s8 vshlq_s8(npyv_s8 a, npyv_s8 b) { - // implementation only works within defined range 'b' in [0, 7] - vbool8_t positive_mask = __riscv_vmsgt_vx_i8m1_b8(b, 0, 16); - npyv_s8 shl = __riscv_vsll_vv_i8m1(a, __riscv_vreinterpret_v_i8m1_u8m1(b), 16); - vint16m2_t a_ext = __riscv_vsext_vf2_i16m2(a, 16); - npyv_s8 b_neg = __riscv_vneg_v_i8m1(b, 16); - npyv_s8 shr = __riscv_vnclip_wv_i8m1(a_ext, __riscv_vreinterpret_v_i8m1_u8m1(b_neg), __RISCV_VXRM_RDN, 16); - return __riscv_vmerge_vvm_i8m1(shr, shl, positive_mask, 16); -} + vuint8m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u8); + // floor(a/d) = 
(mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint8m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u8); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u8); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u8); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u8); -NPY_FINLINE npyv_s8 vshrq_n_s8(npyv_s8 a, const int b) { - const int imm = b - (b >> 3); - return __riscv_vsra_vx_i8m1(a, imm, 16); + return q; } - // divide each signed 8-bit element by a precomputed divisor (round towards zero) NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) { - const npyv_s8 mulc_lo = divisor.val[0]; - // high part of signed multiplication - npyv_s16 mull_lo = __riscv_vlmul_trunc_v_i16m2_i16m1(__riscv_vwmul_vv_i16m2(a, mulc_lo, 8)); - npyv_s16 mull_hi = vmull_high_s8(a, divisor.val[0]); - // get the high unsigned bytes - npyv_s8 mulhi = vuzp2q_s8(__riscv_vreinterpret_v_i16m1_i8m1(mull_lo), __riscv_vreinterpret_v_i16m1_i8m1(mull_hi)); - // q = ((a + mulhi) >> sh1) - XSIGN(a) - // trunc(a/d) = (q ^ dsign) - dsign - npyv_s8 q = vshlq_s8(__riscv_vadd_vv_i8m1(a, mulhi, 16), divisor.val[1]); - q = __riscv_vsub_vv_i8m1(q, vshrq_n_s8(a, 7), 16); - q = __riscv_vsub_vv_i8m1(__riscv_vxor_vv_i8m1(q, divisor.val[2], 16), divisor.val[2], 16); - return q; + vint8m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s8); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint8m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s8), __riscv_vreinterpret_v_i8m1_u8m1(divisor.val[1]), npyv_nlanes_s8); + q = __riscv_vsub(q, __riscv_vsra(a, 7, npyv_nlanes_s8), npyv_nlanes_s8); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s8), divisor.val[2], npyv_nlanes_s8); + return q; } - -NPY_FINLINE npyv_u16 vshlq_u16(npyv_u16 a, npyv_s16 b) { - // implementation only works within defined range 'b' in [0, 15] - vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); - npyv_u16 shl = __riscv_vsll_vv_u16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); - vuint32m2_t a_ext = __riscv_vzext_vf2_u32m2(a, 8); - npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); - npyv_u16 shr = __riscv_vnclipu_wv_u16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); - return __riscv_vmerge_vvm_u16m1(shr, shl, positive_mask, 8); -} - // divide each unsigned 16-bit element by a precomputed divisor NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) { - const npyv_u16 mulc_lo = divisor.val[0]; // high part of unsigned multiplication - npyv_u32 mull_lo = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vwmulu_vv_u32m2(a, mulc_lo, 4)); - npyv_u32 mull_hi = vmull_high_u16(a, divisor.val[0]); - // get the high unsigned bytes - npyv_u16 mulhi = vuzp2q_u16(__riscv_vreinterpret_v_u32m1_u16m1(mull_lo), __riscv_vreinterpret_v_u32m1_u16m1(mull_hi)); - - // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 - npyv_u16 q = __riscv_vsub_vv_u16m1(a, mulhi, 8); - q = vshlq_u16(q, __riscv_vreinterpret_v_u16m1_i16m1(divisor.val[1])); - q = __riscv_vadd_vv_u16m1(mulhi, q, 8); - q = vshlq_u16(q, __riscv_vreinterpret_v_u16m1_i16m1(divisor.val[2])); - return q; -} - -NPY_FINLINE npyv_s16 vshrq_n_s16(npyv_s16 a, const int b) { - const int imm = b - (b >> 4); - return __riscv_vsra_vx_i16m1(a, imm, 8); -} - -NPY_FINLINE npyv_s16 vshlq_s16(npyv_s16 a, npyv_s16 b) { - // implementation only works within defined range 'b' in [0, 15] - vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); - npyv_s16 shl = __riscv_vsll_vv_i16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); - vint32m2_t a_ext = 
__riscv_vsext_vf2_i32m2(a, 8); - npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); - npyv_s16 shr = __riscv_vnclip_wv_i16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); - return __riscv_vmerge_vvm_i16m1(shr, shl, positive_mask, 8); + vuint16m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u16); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint16m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u16); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u16); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u16); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u16); + return q; } - // divide each signed 16-bit element by a precomputed divisor (round towards zero) NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) { - const npyv_s16 mulc_lo = divisor.val[0]; // high part of signed multiplication - npyv_s32 mull_lo = __riscv_vlmul_trunc_v_i32m2_i32m1(__riscv_vwmul_vv_i32m2(a, mulc_lo, 4)); - npyv_s32 mull_hi = vmull_high_s16(a, divisor.val[0]); - // get the high unsigned bytes - npyv_s16 mulhi = vuzp2q_s16(__riscv_vreinterpret_v_i32m1_i16m1(mull_lo), __riscv_vreinterpret_v_i32m1_i16m1(mull_hi)); - // q = ((a + mulhi) >> sh1) - XSIGN(a) - // trunc(a/d) = (q ^ dsign) - dsign - npyv_s16 q = vshlq_s16(__riscv_vadd_vv_i16m1(a, mulhi, 8), divisor.val[1]); - q = __riscv_vsub_vv_i16m1(q, vshrq_n_s16(a, 15), 8); - q = __riscv_vsub_vv_i16m1(__riscv_vxor_vv_i16m1(q, divisor.val[2], 8), divisor.val[2], 8); - return q; -} - -NPY_FINLINE npyv_u32 vshlq_u32(npyv_u32 a, npyv_s32 b) { - // implementation only works within defined range 'b' in [0, 31] - vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); - npyv_u32 shl = __riscv_vsll_vv_u32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); - vuint64m2_t a_ext = __riscv_vzext_vf2_u64m2(a, 4); - npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); - npyv_u32 shr = __riscv_vnclipu_wv_u32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); - return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 4); -} - + vint16m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s16); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint16m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s16), __riscv_vreinterpret_v_i16m1_u16m1(divisor.val[1]), npyv_nlanes_s16); + q = __riscv_vsub(q, __riscv_vsra(a, 15, npyv_nlanes_s16), npyv_nlanes_s16); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s16), divisor.val[2], npyv_nlanes_s16); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) { - const npyv_u32 mulc_lo = divisor.val[0]; // high part of unsigned multiplication - npyv_u64 mull_lo = __riscv_vlmul_trunc_v_u64m2_u64m1(__riscv_vwmulu_vv_u64m2(a, mulc_lo, 2)); - npyv_u64 mull_hi = vmull_high_u32(a, divisor.val[0]); - // get the high unsigned bytes - npyv_u32 mulhi = vuzp2q_u32(__riscv_vreinterpret_v_u64m1_u32m1(mull_lo), __riscv_vreinterpret_v_u64m1_u32m1(mull_hi)); + vuint32m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u32); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint32m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u32); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u32); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u32); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u32); - // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 - npyv_u32 q = __riscv_vsub_vv_u32m1(a, mulhi, 4); - q = vshlq_u32(q, 
__riscv_vreinterpret_v_u32m1_i32m1(divisor.val[1])); - q = __riscv_vadd_vv_u32m1(mulhi, q, 4); - q = vshlq_u32(q, __riscv_vreinterpret_v_u32m1_i32m1(divisor.val[2])); - return q; + return q; } - -NPY_FINLINE npyv_s32 vshrq_n_s32(npyv_s32 a, const int b) { - const int imm = b - (b >> 5); - return __riscv_vsra_vx_i32m1(a, imm, 4); -} - -NPY_FINLINE npyv_s32 vshlq_s32(npyv_s32 a, npyv_s32 b) { - // implementation only works within defined range 'b' in [0, 31] - vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); - npyv_s32 shl = __riscv_vsll_vv_i32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); - vint64m2_t a_ext = __riscv_vsext_vf2_i64m2(a, 4); - npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); - npyv_s32 shr = __riscv_vnclip_wv_i32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); - return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 4); -} - // divide each signed 32-bit element by a precomputed divisor (round towards zero) NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) { - const npyv_s32 mulc_lo = divisor.val[0]; // high part of signed multiplication - npyv_s64 mull_lo = __riscv_vlmul_trunc_v_i64m2_i64m1(__riscv_vwmul_vv_i64m2(a, mulc_lo, 2)); - npyv_s64 mull_hi = vmull_high_s32(a, divisor.val[0]); - // get the high unsigned bytes - npyv_s32 mulhi = vuzp2q_s32(__riscv_vreinterpret_v_i64m1_i32m1(mull_lo), __riscv_vreinterpret_v_i64m1_i32m1(mull_hi)); - // q = ((a + mulhi) >> sh1) - XSIGN(a) - // trunc(a/d) = (q ^ dsign) - dsign - npyv_s32 q = vshlq_s32(__riscv_vadd_vv_i32m1(a, mulhi, 4), divisor.val[1]); - q = __riscv_vsub_vv_i32m1(q, vshrq_n_s32(a, 31), 4); - q = __riscv_vsub_vv_i32m1(__riscv_vxor_vv_i32m1(q, divisor.val[2], 4), divisor.val[2], 4); - return q; -} - -NPY_FINLINE uint64_t vgetq_lane_u64(npyv_u64 a, const int b) { - return __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(a, b, 2)); -} - -NPY_FINLINE int64_t vgetq_lane_s64(npyv_s64 a, const int b) { - return __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(a, b, 2)); -} - -// divide each unsigned 64-bit element by a divisor + vint32m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s32); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint32m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s32), __riscv_vreinterpret_v_i32m1_u32m1(divisor.val[1]), npyv_nlanes_s32); + q = __riscv_vsub(q, __riscv_vsra(a, 31, npyv_nlanes_s32), npyv_nlanes_s32); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s32), divisor.val[2], npyv_nlanes_s32); + return q; +} +// divide each unsigned 64-bit element by a precomputed divisor NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) { - const uint64_t d = vgetq_lane_u64(divisor.val[0], 0); - return npyv_set_u64(vgetq_lane_u64(a, 0) / d, vgetq_lane_u64(a, 1) / d); -} + // high part of unsigned multiplication + vuint64m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u64); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint64m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u64); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u64); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u64); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u64); -// returns the high 64 bits of signed 64-bit multiplication + return q; +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) { - const int64_t d = vgetq_lane_s64(divisor.val[0], 0); - return 
npyv_set_s64(vgetq_lane_s64(a, 0) / d, vgetq_lane_s64(a, 1) / d); + // high part of signed multiplication + vint64m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s64); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint64m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s64), __riscv_vreinterpret_v_i64m1_u64m1(divisor.val[1]), npyv_nlanes_s64); + q = __riscv_vsub(q, __riscv_vsra(a, 63, npyv_nlanes_s64), npyv_nlanes_s64); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s64), divisor.val[2], npyv_nlanes_s64); + return q; } /*************************** * Division ***************************/ -NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfdiv_vv_f32m1(a, b, 4); } -NPY_FINLINE npyv_f64 npyv_div_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfdiv_vv_f64m1(a, b, 2); } +NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfdiv_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_div_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfdiv_vv_f64m1(a, b, npyv_nlanes_f64); } /*************************** * FUSED F32 ***************************/ // multiply and add, a*b + c NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) -{ return __riscv_vfmacc_vv_f32m1(c, a, b, 4); } - +{ return __riscv_vfmadd_vv_f32m1(a, b, c, npyv_nlanes_f32); } // multiply and subtract, a*b - c NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) -{ return __riscv_vfmacc_vv_f32m1(__riscv_vfneg_v_f32m1(c, 4), a, b, 4); } - +{ return __riscv_vfmsub_vv_f32m1(a, b, c, npyv_nlanes_f32); } // negate multiply and add, -(a*b) + c NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) -{ return __riscv_vfnmsac_vv_f32m1(c, a, b, 4); } - +{ return __riscv_vfnmsub_vv_f32m1(a, b, c, npyv_nlanes_f32); } // negate multiply and subtract, -(a*b) - c NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) -{ return __riscv_vfnmsac_vv_f32m1(__riscv_vfneg_v_f32m1(c, 4), a, b, 4); } +{ return __riscv_vfnmadd_vv_f32m1(a, b, c, npyv_nlanes_f32); } // multiply, add for odd elements and subtract even elements. 
// (a * b) -+ c NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) -{ - const npyv_f32 msign = npyv_set_f32(-0.0f, 0.0f, -0.0f, 0.0f); - return npyv_muladd_f32(a, b, npyv_xor_f32(msign, c)); -} +{ return npyv_muladd_f32(a, b, __riscv_vfneg_v_f32m1_mu(__riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1(0x55, npyv_nlanes_u8)), c, c, npyv_nlanes_f32)); } /*************************** * FUSED F64 ***************************/ NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) -{ return __riscv_vfmacc_vv_f64m1(c, a, b, 2); } - +{ return __riscv_vfmadd_vv_f64m1(a, b, c, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) -{ return __riscv_vfmacc_vv_f64m1(__riscv_vfneg_v_f64m1(c, 2), a, b, 2); } - +{ return __riscv_vfmsub_vv_f64m1(a, b, c, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) -{ return __riscv_vfnmsac_vv_f64m1(c, a, b, 2); } - +{ return __riscv_vfnmsub_vv_f64m1(a, b, c, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) -{ return __riscv_vfnmsac_vv_f64m1(__riscv_vfneg_v_f64m1(c, 2), a, b, 2); } +{ return __riscv_vfnmadd_vv_f64m1(a, b, c, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) -{ - const npyv_f64 msign = npyv_set_f64(-0.0, 0.0); - return npyv_muladd_f64(a, b, npyv_xor_f64(msign, c)); -} +{ return npyv_muladd_f64(a, b, __riscv_vfneg_v_f64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0x55, npyv_nlanes_u8)), c, c, npyv_nlanes_f64)); } /*************************** * Summation ***************************/ // reduce sum across vector -NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) { - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredsum_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 4)); -} - -NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a) { - return __riscv_vmv_x_s_u64m1_u64(__riscv_vredsum_vs_u64m1_u64m1(a, __riscv_vmv_v_x_u64m1(0, 2), 2)); -} - -NPY_FINLINE float npyv_sum_f32(npyv_f32 a) { - return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredosum_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(0, 4), 4)); -} - -NPY_FINLINE double npyv_sum_f64(npyv_f64 a) { - return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredosum_vs_f64m1_f64m1(a, __riscv_vfmv_v_f_f64m1(0, 2), 2)); -} - -NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) { - return __riscv_vmv_x_s_u16m1_u16(__riscv_vwredsumu_vs_u8m1_u16m1(a, __riscv_vmv_v_x_u16m1(0, 8), 16)); -} - -NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) { - return __riscv_vmv_x_s_u32m1_u32(__riscv_vwredsumu_vs_u16m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 8)); -} - -#endif // _NPY_SIMD_RVV_ARITHMETIC_H \ No newline at end of file +NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) +{ return __riscv_vmv_x(__riscv_vredsum(a, __riscv_vmv_s_x_u32m1(0, 1), npyv_nlanes_u32)); } +NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a) +{ return __riscv_vmv_x(__riscv_vredsum(a, __riscv_vmv_s_x_u64m1(0, 1), npyv_nlanes_u64)); } +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ return __riscv_vfmv_f(__riscv_vfredosum(a, __riscv_vfmv_s_f_f32m1(0, 1), npyv_nlanes_f32)); } +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ return __riscv_vfmv_f(__riscv_vfredosum(a, __riscv_vfmv_s_f_f64m1(0, 1), npyv_nlanes_f64)); } + +NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) +{ return __riscv_vmv_x(__riscv_vwredsumu(a, __riscv_vmv_s_x_u16m1(0, 1), npyv_nlanes_u8)); } +NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) +{ return __riscv_vmv_x(__riscv_vwredsumu(a, 
__riscv_vmv_s_x_u32m1(0, 1), npyv_nlanes_u16)); } + +#endif // _NPY_SIMD_RVV_ARITHMETIC_H diff --git a/numpy/_core/src/common/simd/rvv/conversion.h b/numpy/_core/src/common/simd/rvv/conversion.h index 1e937920c4e7..6399dca4b5d8 100644 --- a/numpy/_core/src/common/simd/rvv/conversion.h +++ b/numpy/_core/src/common/simd/rvv/conversion.h @@ -6,136 +6,112 @@ #define _NPY_SIMD_RVV_CVT_H #define npyv_cvt_u8_b8(A) A -#define npyv_cvt_s8_b8(A) __riscv_vreinterpret_v_u8m1_i8m1(A) #define npyv_cvt_u16_b16(A) A -#define npyv_cvt_s16_b16(A) __riscv_vreinterpret_v_u16m1_i16m1(A) #define npyv_cvt_u32_b32(A) A -#define npyv_cvt_s32_b32(A) __riscv_vreinterpret_v_u32m1_i32m1(A) #define npyv_cvt_u64_b64(A) A -#define npyv_cvt_s64_b64(A) __riscv_vreinterpret_v_u64m1_i64m1(A) -#define npyv_cvt_f32_b32(A) __riscv_vreinterpret_v_u32m1_f32m1(A) -#define npyv_cvt_f64_b64(A) __riscv_vreinterpret_v_u64m1_f64m1(A) +#define npyv_cvt_s8_b8(A) __riscv_vreinterpret_v_u8m1_i8m1(npyv_cvt_u8_b8(A)) +#define npyv_cvt_s16_b16(A) __riscv_vreinterpret_v_u16m1_i16m1(npyv_cvt_u16_b16(A)) +#define npyv_cvt_s32_b32(A) __riscv_vreinterpret_v_u32m1_i32m1(npyv_cvt_u32_b32(A)) +#define npyv_cvt_s64_b64(A) __riscv_vreinterpret_v_u64m1_i64m1(npyv_cvt_u64_b64(A)) +#define npyv_cvt_f32_b32(A) __riscv_vreinterpret_v_u32m1_f32m1(npyv_cvt_u32_b32(A)) +#define npyv_cvt_f64_b64(A) __riscv_vreinterpret_v_u64m1_f64m1(npyv_cvt_u64_b64(A)) #define npyv_cvt_b8_u8(A) A -#define npyv_cvt_b8_s8(A) __riscv_vreinterpret_v_i8m1_u8m1(A) #define npyv_cvt_b16_u16(A) A -#define npyv_cvt_b16_s16(A) __riscv_vreinterpret_v_i16m1_u16m1(A) #define npyv_cvt_b32_u32(A) A -#define npyv_cvt_b32_s32(A) __riscv_vreinterpret_v_i32m1_u32m1(A) #define npyv_cvt_b64_u64(A) A -#define npyv_cvt_b64_s64(A) __riscv_vreinterpret_v_i64m1_u64m1(A) -#define npyv_cvt_b32_f32(A) __riscv_vreinterpret_v_f32m1_u32m1(A) -#define npyv_cvt_b64_f64(A) __riscv_vreinterpret_v_f64m1_u64m1(A) - -NPY_FINLINE npyv_u8 vqtbl1q_u8(npyv_u8 t, npyv_u8 idx) { - vbool8_t mask = __riscv_vmsgeu_vx_u8m1_b8(idx, 16, 16); - return __riscv_vmerge_vxm_u8m1(__riscv_vrgather_vv_u8m1(t, idx, 16), 0, mask, 16); -} +#define npyv_cvt_b8_s8(A) npyv_cvt_b8_u8(__riscv_vreinterpret_v_i8m1_u8m1(A)) +#define npyv_cvt_b16_s16(A) npyv_cvt_b16_u16(__riscv_vreinterpret_v_i16m1_u16m1(A)) +#define npyv_cvt_b32_s32(A) npyv_cvt_b32_u32(__riscv_vreinterpret_v_i32m1_u32m1(A)) +#define npyv_cvt_b64_s64(A) npyv_cvt_b64_u64(__riscv_vreinterpret_v_i64m1_u64m1(A)) +#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(__riscv_vreinterpret_v_f32m1_u32m1(A)) +#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(__riscv_vreinterpret_v_f64m1_u64m1(A)) + +#define npyv__from_b8(A) __riscv_vmseq_vx_u8m1_b8(A, UINT8_MAX, npyv_nlanes_u8) +#define npyv__from_b16(A) __riscv_vmseq_vx_u16m1_b16(A, UINT16_MAX, npyv_nlanes_u16) +#define npyv__from_b32(A) __riscv_vmseq_vx_u32m1_b32(A, UINT32_MAX, npyv_nlanes_u32) +#define npyv__from_b64(A) __riscv_vmseq_vx_u64m1_b64(A, UINT64_MAX, npyv_nlanes_u64) +#define npyv__to_b8(A) __riscv_vmerge_vxm_u8m1(__riscv_vmv_v_x_u8m1(0, npyv_nlanes_u8), UINT8_MAX, A, npyv_nlanes_u8) +#define npyv__to_b16(A) __riscv_vmerge_vxm_u16m1(__riscv_vmv_v_x_u16m1(0, npyv_nlanes_u16), UINT16_MAX, A, npyv_nlanes_u16) +#define npyv__to_b32(A) __riscv_vmerge_vxm_u32m1(__riscv_vmv_v_x_u32m1(0, npyv_nlanes_u32), UINT32_MAX, A, npyv_nlanes_u32) +#define npyv__to_b64(A) __riscv_vmerge_vxm_u64m1(__riscv_vmv_v_x_u64m1(0, npyv_nlanes_u64), UINT64_MAX, A, npyv_nlanes_u64) NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) -{ - const npyv_u8 scale = 
npyv_set_u8(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128); - npyv_u8 seq_scale = __riscv_vand_vv_u8m1(a, scale, 16); - const npyv_u8 byteOrder = npyv_set_u8(0,8,1,9,2,10,3,11,4,12,5,13,6,14,7,15); - npyv_u8 v0 = vqtbl1q_u8(seq_scale, byteOrder); - return __riscv_vmv_x_s_u32m1_u32(__riscv_vwredsumu_vs_u16m1_u32m1(__riscv_vreinterpret_v_u8m1_u16m1(v0), __riscv_vmv_v_x_u32m1(0, 4), 8)); -} - +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b8_u64m1(npyv__from_b8(a))) & (npyv_nlanes_u8 == 64 ? ~0 : (1ULL << npyv_nlanes_u8) - 1); } NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) -{ - const npyv_u16 scale = npyv_set_u16(1, 2, 4, 8, 16, 32, 64, 128); - npyv_u16 seq_scale = __riscv_vand_vv_u16m1(a, scale, 8); - return __riscv_vmv_x_s_u16m1_u16(__riscv_vredsum_vs_u16m1_u16m1(seq_scale, __riscv_vmv_v_x_u16m1(0, 8), 8)); -} - +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b16_u64m1(npyv__from_b16(a))) & ((1ULL << npyv_nlanes_u16) - 1); } NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) -{ - const npyv_u32 scale = npyv_set_u32(1, 2, 4, 8); - npyv_u32 seq_scale = __riscv_vand_vv_u32m1(a, scale, 4); - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredsum_vs_u32m1_u32m1(seq_scale, __riscv_vmv_v_x_u32m1(0, 4), 4)); -} - +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b32_u64m1(npyv__from_b32(a))) & ((1ULL << npyv_nlanes_u32) - 1); } NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - uint64_t first = __riscv_vmv_x_s_u64m1_u64(a); - uint64_t second = __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(a, 1, vlen)); - return ((second & 0x2) | (first & 0x1)); -} +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b64_u64m1(npyv__from_b64(a))) & ((1ULL << npyv_nlanes_u64) - 1); } //expand -NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) { - npyv_u16x2 r; - r.val[0] = __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(data, 8)); - r.val[1] = __riscv_vlmul_trunc_v_u16m2_u16m1(__riscv_vzext_vf2_u16m2(__riscv_vslidedown_vx_u8m1(data, 8, 16), 8)); - return r; +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) +{ + vuint16m2_t ext = __riscv_vzext_vf2(data, npyv_nlanes_u8); + return (npyv_u16x2){{ + __riscv_vget_v_u16m2_u16m1(ext, 0), + __riscv_vget_v_u16m2_u16m1(ext, 1) + }}; } -NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) { - npyv_u32x2 r; - r.val[0] = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(data, 4)); - r.val[1] = __riscv_vlmul_trunc_v_u32m2_u32m1(__riscv_vzext_vf2_u32m2(__riscv_vslidedown_vx_u16m1(data, 4, 8), 4)); - return r; +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) +{ + vuint32m2_t ext = __riscv_vzext_vf2(data, npyv_nlanes_u16); + return (npyv_u32x2){{ + __riscv_vget_v_u32m2_u32m1(ext, 0), + __riscv_vget_v_u32m2_u32m1(ext, 1) + }}; } // pack two 16-bit boolean into one 8-bit boolean vector -NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) { - npyv_b8 a8 = __riscv_vreinterpret_v_u16m1_u8m1(a); - npyv_b8 b8 = __riscv_vreinterpret_v_u16m1_u8m1(b); - return vuzp1q_u8(a8, b8); +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) +{ + return npyv__to_b8(__riscv_vreinterpret_v_u64m1_b8(__riscv_vmv_s_x_u64m1( + npyv_tobits_b16(b) << npyv_nlanes_u16 | + npyv_tobits_b16(a), 1 + ))); } // pack four 32-bit boolean vectors into one 8-bit boolean vector NPY_FINLINE npyv_b8 -npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) { - npyv_b16 a16 = __riscv_vreinterpret_v_u32m1_u16m1(a); - npyv_b16 b16 = __riscv_vreinterpret_v_u32m1_u16m1(b); - npyv_b16 c16 = 
__riscv_vreinterpret_v_u32m1_u16m1(c); - npyv_b16 d16 = __riscv_vreinterpret_v_u32m1_u16m1(d); - - npyv_b16 ab = vuzp1q_u16(a16, b16); - npyv_b16 cd = vuzp1q_u16(c16, d16); - - return npyv_pack_b8_b16(ab, cd); +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) +{ + return npyv__to_b8(__riscv_vreinterpret_v_u64m1_b8(__riscv_vmv_s_x_u64m1( + npyv_tobits_b32(d) << (npyv_nlanes_u32 * 3) | + npyv_tobits_b32(c) << (npyv_nlanes_u32 * 2) | + npyv_tobits_b32(b) << npyv_nlanes_u32 | + npyv_tobits_b32(a), 1 + ))); } // pack eight 64-bit boolean vectors into one 8-bit boolean vector - NPY_FINLINE npyv_b8 - npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, - npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) { - npyv_b32 a32 = __riscv_vreinterpret_v_u64m1_u32m1(a); - npyv_b32 b32 = __riscv_vreinterpret_v_u64m1_u32m1(b); - npyv_b32 c32 = __riscv_vreinterpret_v_u64m1_u32m1(c); - npyv_b32 d32 = __riscv_vreinterpret_v_u64m1_u32m1(d); - npyv_b32 e32 = __riscv_vreinterpret_v_u64m1_u32m1(e); - npyv_b32 f32 = __riscv_vreinterpret_v_u64m1_u32m1(f); - npyv_b32 g32 = __riscv_vreinterpret_v_u64m1_u32m1(g); - npyv_b32 h32 = __riscv_vreinterpret_v_u64m1_u32m1(h); - - npyv_b32 ab = vuzp1q_u32(a32, b32); - npyv_b32 cd = vuzp1q_u32(c32, d32); - npyv_b32 ef = vuzp1q_u32(e32, f32); - npyv_u32 gh = vuzp1q_u32(g32, h32); - - return npyv_pack_b8_b32(ab, cd, ef, gh); - } +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) +{ + return npyv__to_b8(__riscv_vreinterpret_v_u64m1_b8(__riscv_vmv_s_x_u64m1( + npyv_tobits_b64(h) << (npyv_nlanes_u64 * 7) | + npyv_tobits_b64(g) << (npyv_nlanes_u64 * 6) | + npyv_tobits_b64(f) << (npyv_nlanes_u64 * 5) | + npyv_tobits_b64(e) << (npyv_nlanes_u64 * 4) | + npyv_tobits_b64(d) << (npyv_nlanes_u64 * 3) | + npyv_tobits_b64(c) << (npyv_nlanes_u64 * 2) | + npyv_tobits_b64(b) << npyv_nlanes_u64 | + npyv_tobits_b64(a), 1 + ))); +} // round to nearest integer NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a) { - size_t vlen = __riscv_vsetvlmax_e32m1(); // (round-to-nearest-even) - return __riscv_vfcvt_x_f_v_i32m1(a, vlen); -} - -NPY_FINLINE npyv_s32 vmovn_s64(npyv_s64 a) { - return __riscv_vnsra_wx_i32m1(__riscv_vlmul_ext_v_i64m1_i64m2(a), 0, 2); + return __riscv_vfcvt_x_f_v_i32m1(a, npyv_nlanes_s32); } NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) { - npyv_s64 lo = __riscv_vfcvt_x_f_v_i64m1(a, 2), hi = __riscv_vfcvt_x_f_v_i64m1(b, 2); - return __riscv_vslideup_vx_i32m1(vmovn_s64(lo), vmovn_s64(hi), 2, 4); + return __riscv_vfncvt_x_f_w_i32m1(__riscv_vcreate_v_f64m1_f64m2(a, b), npyv_nlanes_s32); } #endif // _NPY_SIMD_RVV_CVT_H diff --git a/numpy/_core/src/common/simd/rvv/math.h b/numpy/_core/src/common/simd/rvv/math.h index 9b3b34a200dc..de50e2c573b9 100644 --- a/numpy/_core/src/common/simd/rvv/math.h +++ b/numpy/_core/src/common/simd/rvv/math.h @@ -5,590 +5,248 @@ #ifndef _NPY_SIMD_RVV_MATH_H #define _NPY_SIMD_RVV_MATH_H +#include #include /*************************** * Elementary ***************************/ NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfabs_v_f32m1(a, vlen); -} - +{ return __riscv_vfabs_v_f32m1(a, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfabs_v_f64m1(a, vlen); -} +{ return __riscv_vfabs_v_f64m1(a, npyv_nlanes_f64); } // Square NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) -{ - size_t vlen = 
__riscv_vsetvlmax_e32m1(); - return __riscv_vfmul_vv_f32m1(a, a, vlen); -} - +{ return __riscv_vfmul_vv_f32m1(a, a, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfmul_vv_f64m1(a, a, vlen); -} +{ return __riscv_vfmul_vv_f64m1(a, a, npyv_nlanes_f64); } // Square root NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfsqrt_v_f32m1(a, vlen); -} - +{ return __riscv_vfsqrt_v_f32m1(a, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_sqrt_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfsqrt_v_f64m1(a, vlen); -} +{ return __riscv_vfsqrt_v_f64m1(a, npyv_nlanes_f64); } // Reciprocal NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - vfloat32m1_t one = __riscv_vfmv_v_f_f32m1(1.0f, vlen); - return __riscv_vfdiv_vv_f32m1(one, a, vlen); -} - +{ return __riscv_vfrdiv_vf_f32m1(a, 1.0f, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - npyv_f64 one = __riscv_vfmv_v_f_f64m1(1.0, vlen); - return __riscv_vfdiv_vv_f64m1(one, a, vlen); -} +{ return __riscv_vfrdiv_vf_f64m1(a, 1.0 , npyv_nlanes_f64); } // Maximum NPY_FINLINE npyv_f32 npyv_max_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfmax_vv_f32m1(a, b, vlen); -} - +{ return __riscv_vfmax_vv_f32m1(a, b, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_max_f64(npyv_f64 a, npyv_f64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfmax_vv_f64m1(a, b, vlen); -} - -// Maximum -NPY_FINLINE npyv_f32 npyv_maxp_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); +{ return __riscv_vfmax_vv_f64m1(a, b, npyv_nlanes_f64); } - vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); - vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(b, b, vlen); - - vfloat32m1_t sel_a = __riscv_vmerge_vvm_f32m1(b, a, not_nan_a, vlen); - vfloat32m1_t sel_b = __riscv_vmerge_vvm_f32m1(a, b, not_nan_b, vlen); - - return __riscv_vfmax_vv_f32m1(sel_a, sel_b, vlen); -} +// Max, NaN-suppressing +#define npyv_maxp_f32 npyv_max_f32 +#define npyv_maxp_f64 npyv_max_f64 - -// Max, propagates NaNs +// Max, NaN-propagating NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) { - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfmax_vv_f32m1(a, b, vlen); + return __riscv_vfmax_vv_f32m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f32), npyv_nlanes_f32), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32), + npyv_nlanes_f32 + ); } - -// Max, NaN-propagating NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) { - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfmax_vv_f64m1(a, b, vlen); -} - -// Max, NaN-suppressing -NPY_FINLINE npyv_f64 npyv_maxp_f64(npyv_f64 a, npyv_f64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - - vbool64_t a_is_nan = __riscv_vmfne_vv_f64m1_b64(a, a, vlen); - vbool64_t b_is_nan = __riscv_vmfne_vv_f64m1_b64(b, b, vlen); - - vfloat64m1_t max = __riscv_vfmax_vv_f64m1(a, b, vlen); - max = __riscv_vmerge_vvm_f64m1(max, b, a_is_nan, vlen); - max = __riscv_vmerge_vvm_f64m1(max, a, b_is_nan, vlen); - - return max; + return __riscv_vfmax_vv_f64m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f64), npyv_nlanes_f64), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64), + npyv_nlanes_f64 + ); } // Maximum, integer 
operations NPY_FINLINE npyv_u8 npyv_max_u8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vmaxu_vv_u8m1(a, b, 16); -} - +{ return __riscv_vmaxu_vv_u8m1(a, b, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_max_s8(npyv_s8 a, npyv_s8 b) -{ - return __riscv_vmax_vv_i8m1(a, b, 16); -} - +{ return __riscv_vmax_vv_i8m1(a, b, npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_max_u16(npyv_u16 a, npyv_u16 b) -{ - size_t vlen = __riscv_vsetvlmax_e16m1(); - return __riscv_vmaxu_vv_u16m1(a, b, vlen); -} - +{ return __riscv_vmaxu_vv_u16m1(a, b, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_max_s16(npyv_s16 a, npyv_s16 b) -{ - size_t vlen = __riscv_vsetvlmax_e16m1(); - return __riscv_vmax_vv_i16m1(a, b, vlen); -} - +{ return __riscv_vmax_vv_i16m1(a, b, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_max_u32(npyv_u32 a, npyv_u32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vmaxu_vv_u32m1(a, b, vlen); -} - +{ return __riscv_vmaxu_vv_u32m1(a, b, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_max_s32(npyv_s32 a, npyv_s32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vmax_vv_i32m1(a, b, vlen); -} - +{ return __riscv_vmax_vv_i32m1(a, b, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_max_u64(npyv_u64 a, npyv_u64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - vbool64_t mask = __riscv_vmsgtu_vv_u64m1_b64(a, b, vlen); - return __riscv_vmerge_vvm_u64m1(b, a, mask, vlen); -} - +{ return __riscv_vmaxu_vv_u64m1(a, b, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 a, npyv_s64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - vbool64_t mask = __riscv_vmsgt_vv_i64m1_b64(a, b, vlen); - return __riscv_vmerge_vvm_i64m1(b, a, mask, vlen); -} +{ return __riscv_vmax_vv_i64m1(a, b, npyv_nlanes_s64); } -// Minimum, natively mapping with no guarantees to handle NaN. +// Minimum NPY_FINLINE npyv_f32 npyv_min_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfmin_vv_f32m1(a, b, vlen); -} - +{ return __riscv_vfmin_vv_f32m1(a, b, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_min_f64(npyv_f64 a, npyv_f64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfmin_vv_f64m1(a, b, vlen); -} - -// Minimum -NPY_FINLINE npyv_f32 npyv_minp_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - - vbool32_t not_nan_a = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); - vbool32_t not_nan_b = __riscv_vmfeq_vv_f32m1_b32(b, b, vlen); +{ return __riscv_vfmin_vv_f64m1(a, b, npyv_nlanes_f64); } - vfloat32m1_t sel_a = __riscv_vmerge_vvm_f32m1(b, a, not_nan_a, vlen); - vfloat32m1_t sel_b = __riscv_vmerge_vvm_f32m1(a, b, not_nan_b, vlen); - - return __riscv_vfmin_vv_f32m1(sel_a, sel_b, vlen); -} +// Min, NaN-suppressing +#define npyv_minp_f32 npyv_min_f32 +#define npyv_minp_f64 npyv_min_f64 -// Min, propagates NaNs -// If any of corresponded element is NaN, NaN is set. 
+// Min, NaN-propagating NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) { - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfmin_vv_f32m1(a, b, vlen); + return __riscv_vfmin_vv_f32m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f32), npyv_nlanes_f32), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32), + npyv_nlanes_f32 + ); } - -// Min, NaN-propagating NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) { - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfmin_vv_f64m1(a, b, vlen); -} - -// Min, NaN-suppressing -NPY_FINLINE npyv_f64 npyv_minp_f64(npyv_f64 a, npyv_f64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - - vbool64_t a_is_nan = __riscv_vmfne_vv_f64m1_b64(a, a, vlen); - vbool64_t b_is_nan = __riscv_vmfne_vv_f64m1_b64(b, b, vlen); - - npyv_f64 min = __riscv_vfmin_vv_f64m1(a, b, vlen); - min = __riscv_vmerge_vvm_f64m1(min, b, a_is_nan, vlen); - min = __riscv_vmerge_vvm_f64m1(min, a, b_is_nan, vlen); - - return min; + return __riscv_vfmin_vv_f64m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f64), npyv_nlanes_f64), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64), + npyv_nlanes_f64 + ); } // Minimum, integer operations NPY_FINLINE npyv_u8 npyv_min_u8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vminu_vv_u8m1(a, b, 16); -} - +{ return __riscv_vminu_vv_u8m1(a, b, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_min_s8(npyv_s8 a, npyv_s8 b) -{ - return __riscv_vmin_vv_i8m1(a, b, 16); -} - +{ return __riscv_vmin_vv_i8m1(a, b, npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_min_u16(npyv_u16 a, npyv_u16 b) -{ - size_t vlen = __riscv_vsetvlmax_e16m1(); - return __riscv_vminu_vv_u16m1(a, b, vlen); -} - +{ return __riscv_vminu_vv_u16m1(a, b, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_min_s16(npyv_s16 a, npyv_s16 b) -{ - size_t vlen = __riscv_vsetvlmax_e16m1(); - return __riscv_vmin_vv_i16m1(a, b, vlen); -} - +{ return __riscv_vmin_vv_i16m1(a, b, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_min_u32(npyv_u32 a, npyv_u32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vminu_vv_u32m1(a, b, vlen); -} - +{ return __riscv_vminu_vv_u32m1(a, b, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_min_s32(npyv_s32 a, npyv_s32 b) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vmin_vv_i32m1(a, b, vlen); -} - +{ return __riscv_vmin_vv_i32m1(a, b, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_min_u64(npyv_u64 a, npyv_u64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - vbool64_t mask = __riscv_vmsltu_vv_u64m1_b64(a, b, vlen); - return __riscv_vmerge_vvm_u64m1(b, a, mask, vlen); -} - +{ return __riscv_vminu_vv_u64m1(a, b, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - vbool64_t mask = __riscv_vmslt_vv_i64m1_b64(a, b, vlen); - return __riscv_vmerge_vvm_i64m1(b, a, mask, vlen); -} +{ return __riscv_vmin_vv_i64m1(a, b, npyv_nlanes_s64); } // reduce min/max for all data types // Maximum reductions NPY_FINLINE uint8_t npyv_reduce_max_u8(npyv_u8 a) -{ - return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(0, 16), 16)); -} - +{ return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(a, __riscv_vmv_s_x_u8m1(0, 1), npyv_nlanes_u8)); } NPY_FINLINE int8_t npyv_reduce_max_s8(npyv_s8 a) -{ - return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmax_vs_i8m1_i8m1(a, __riscv_vmv_v_x_i8m1(INT8_MIN, 16), 16)); -} - +{ return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmax_vs_i8m1_i8m1(a, 
__riscv_vmv_s_x_i8m1(INT8_MIN, 1), npyv_nlanes_s8)); } NPY_FINLINE uint16_t npyv_reduce_max_u16(npyv_u16 a) -{ - return __riscv_vmv_x_s_u16m1_u16(__riscv_vredmaxu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(0, 8), 8)); -} - +{ return __riscv_vmv_x_s_u16m1_u16(__riscv_vredmaxu_vs_u16m1_u16m1(a, __riscv_vmv_s_x_u16m1(0, 1), npyv_nlanes_u16)); } NPY_FINLINE int16_t npyv_reduce_max_s16(npyv_s16 a) -{ - return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmax_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(INT16_MIN, 8), 8)); -} - +{ return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmax_vs_i16m1_i16m1(a, __riscv_vmv_s_x_i16m1(INT16_MIN, 1), npyv_nlanes_s16)); } NPY_FINLINE uint32_t npyv_reduce_max_u32(npyv_u32 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 4)); -} - +{ return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(a, __riscv_vmv_s_x_u32m1(0, 1), npyv_nlanes_u32)); } NPY_FINLINE int32_t npyv_reduce_max_s32(npyv_s32 a) -{ - return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmax_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(INT32_MIN, 4), 4)); -} +{ return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmax_vs_i32m1_i32m1(a, __riscv_vmv_s_x_i32m1(INT32_MIN, 1), npyv_nlanes_s32)); } +NPY_FINLINE uint64_t npyv_reduce_max_u64(npyv_u64 a) +{ return __riscv_vmv_x_s_u64m1_u64(__riscv_vredmaxu_vs_u64m1_u64m1(a, __riscv_vmv_s_x_u64m1(0, 1), npyv_nlanes_u64)); } +NPY_FINLINE int64_t npyv_reduce_max_s64(npyv_s64 a) +{ return __riscv_vmv_x_s_i64m1_i64(__riscv_vredmax_vs_i64m1_i64m1(a, __riscv_vmv_s_x_i64m1(INT64_MIN, 1), npyv_nlanes_s64)); } // Floating-point maximum reductions NPY_FINLINE float npyv_reduce_max_f32(npyv_f32 a) -{ - uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b32_u8m1(__riscv_vmfeq_vv_f32m1_b32(a, a, 4))); - if ((mask & 0b1111) != 0b1111) { - return NAN; - } - return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(-FLT_MAX, 4), 4)); -} - +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32) != -1 ? __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a, __riscv_vfmv_s_f_f32m1(-INFINITY, 1), npyv_nlanes_f32)) : NAN; } NPY_FINLINE double npyv_reduce_max_f64(npyv_f64 a) -{ - uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b64_u8m1(__riscv_vmfeq_vv_f64m1_b64(a, a, 2))); - if ((mask & 0b11) != 0b11) { - return NAN; - } - return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmax_vs_f64m1_f64m1(a, __riscv_vfmv_v_f_f64m1(-DBL_MAX, 2), 2)); -} - -// NaN-propagating maximum reductions -#define npyv_reduce_maxn_f32 npyv_reduce_max_f32 -#define npyv_reduce_maxn_f64 npyv_reduce_max_f64 +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64) != -1 ? 
__riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmax_vs_f64m1_f64m1(a, __riscv_vfmv_s_f_f64m1(-INFINITY, 1), npyv_nlanes_f64)) : NAN; } // NaN-suppressing maximum reductions -NPY_FINLINE float npyv_reduce_maxp_f32(npyv_f32 a) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - - vbool32_t valid_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); - - vbool32_t nan_mask = __riscv_vmnot_m_b32(valid_mask, vlen); - - npyv_f32 masked_a = __riscv_vfmerge_vfm_f32m1( - a, - -INFINITY, - nan_mask, - vlen - ); - - return __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredmax_vs_f32m1_f32m1( - masked_a, - __riscv_vfmv_v_f_f32m1(-INFINITY, vlen), - vlen - ) - ); -} +#define npyv_reduce_maxp_f32 npyv_reduce_max_f32 +#define npyv_reduce_maxp_f64 npyv_reduce_max_f64 -NPY_FINLINE double npyv_reduce_maxp_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - - vbool64_t valid_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, vlen); - - vbool64_t nan_mask = __riscv_vmnot_m_b64(valid_mask, vlen); - - npyv_f64 masked_a = __riscv_vfmerge_vfm_f64m1( - a, - -INFINITY, - nan_mask, - vlen - ); - - return __riscv_vfmv_f_s_f64m1_f64( - __riscv_vfredmax_vs_f64m1_f64m1( - masked_a, - __riscv_vfmv_v_f_f64m1(-INFINITY, vlen), - vlen - ) - ); -} +// NaN-propagating maximum reductions +NPY_FINLINE float npyv_reduce_maxn_f32(npyv_f32 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f32), npyv_nlanes_f32) == -1 ? npyv_reduce_max_f32(a) : NAN; } +NPY_FINLINE double npyv_reduce_maxn_f64(npyv_f64 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f64), npyv_nlanes_f64) == -1 ? npyv_reduce_max_f64(a) : NAN; } // Minimum reductions NPY_FINLINE uint8_t npyv_reduce_min_u8(npyv_u8 a) -{ - return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), 16)); -} - +{ return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1(a, __riscv_vmv_s_x_u8m1(UINT8_MAX, 1), npyv_nlanes_u8)); } NPY_FINLINE int8_t npyv_reduce_min_s8(npyv_s8 a) -{ - return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmin_vs_i8m1_i8m1(a, __riscv_vmv_v_x_i8m1(INT8_MAX, 16), 16)); -} - +{ return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmin_vs_i8m1_i8m1(a, __riscv_vmv_s_x_i8m1(INT8_MAX, 1), npyv_nlanes_s8)); } NPY_FINLINE uint16_t npyv_reduce_min_u16(npyv_u16 a) -{ - return __riscv_vmv_x_s_u16m1_u16(__riscv_vredminu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), 8)); -} - +{ return __riscv_vmv_x_s_u16m1_u16(__riscv_vredminu_vs_u16m1_u16m1(a, __riscv_vmv_s_x_u16m1(UINT16_MAX, 1), npyv_nlanes_u16)); } NPY_FINLINE int16_t npyv_reduce_min_s16(npyv_s16 a) -{ - return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmin_vs_i16m1_i16m1(a, __riscv_vmv_v_x_i16m1(INT16_MAX, 8), 8)); -} - +{ return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmin_vs_i16m1_i16m1(a, __riscv_vmv_s_x_i16m1(INT16_MAX, 1), npyv_nlanes_s16)); } NPY_FINLINE uint32_t npyv_reduce_min_u32(npyv_u32 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)); -} - +{ return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a, __riscv_vmv_s_x_u32m1(UINT32_MAX, 1), npyv_nlanes_u32)); } NPY_FINLINE int32_t npyv_reduce_min_s32(npyv_s32 a) -{ - return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmin_vs_i32m1_i32m1(a, __riscv_vmv_v_x_i32m1(INT32_MAX, 4), 4)); -} +{ return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmin_vs_i32m1_i32m1(a, __riscv_vmv_s_x_i32m1(INT32_MAX, 1), npyv_nlanes_s32)); } +NPY_FINLINE uint64_t npyv_reduce_min_u64(npyv_u64 a) +{ return __riscv_vmv_x_s_u64m1_u64(__riscv_vredminu_vs_u64m1_u64m1(a, __riscv_vmv_s_x_u64m1(UINT64_MAX, 1), 
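The reduction rework above leans on the guard __riscv_vfirst(__riscv_vmfeq(a, a, vl), vl): it returns -1 only when no lane compares equal to itself, i.e. when every lane is NaN. Assuming vfredmax follows the IEEE-754 maximumNumber rule (a single NaN operand is ignored), the base npyv_reduce_max_* skips NaN lanes and only yields NaN when all lanes are NaN, which is why the NaN-suppressing maxp variants can simply alias it, while the NaN-propagating maxn variants bail out as soon as vmfne finds any NaN lane. A scalar reference of those semantics (reduce_max_ref/reduce_maxn_ref are illustrative helpers; the lane count is a runtime parameter here):

#include <math.h>
#include <stddef.h>

static float reduce_max_ref(const float *lane, size_t nlanes)
{
    int   any_valid = 0;
    float acc = -INFINITY;                 /* same seed as the vector code */
    for (size_t i = 0; i < nlanes; i++) {
        if (!isnan(lane[i])) {             /* NaN lanes are skipped */
            any_valid = 1;
            acc = lane[i] > acc ? lane[i] : acc;
        }
    }
    return any_valid ? acc : NAN;          /* all-NaN input -> NaN, like the vfirst guard */
}

static float reduce_maxn_ref(const float *lane, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; i++)
        if (isnan(lane[i]))
            return NAN;                    /* propagate on the first NaN lane */
    return reduce_max_ref(lane, nlanes);
}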
npyv_nlanes_u64)); } +NPY_FINLINE int64_t npyv_reduce_min_s64(npyv_s64 a) +{ return __riscv_vmv_x_s_i64m1_i64(__riscv_vredmin_vs_i64m1_i64m1(a, __riscv_vmv_s_x_i64m1(INT64_MAX, 1), npyv_nlanes_s64)); } // Floating-point minimum reductions NPY_FINLINE float npyv_reduce_min_f32(npyv_f32 a) -{ - uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b32_u8m1(__riscv_vmfeq_vv_f32m1_b32(a, a, 4))); - if ((mask & 0b1111) != 0b1111) { - return NAN; - } - return __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a, __riscv_vfmv_v_f_f32m1(FLT_MAX, 4), 4)); -} - +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32) != -1 ? __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a, __riscv_vfmv_s_f_f32m1(INFINITY, 1), npyv_nlanes_f32)) : NAN; } NPY_FINLINE double npyv_reduce_min_f64(npyv_f64 a) -{ - uint8_t mask = __riscv_vmv_x_s_u8m1_u8(__riscv_vreinterpret_v_b64_u8m1(__riscv_vmfeq_vv_f64m1_b64(a, a, 2))); - if ((mask & 0b11) != 0b11) { - return NAN; - } - return __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmin_vs_f64m1_f64m1(a, __riscv_vfmv_v_f_f64m1(DBL_MAX, 2), 2)); -} - -// NaN-propagating minimum reductions -#define npyv_reduce_minn_f32 npyv_reduce_min_f32 -#define npyv_reduce_minn_f64 npyv_reduce_min_f64 +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64) != -1 ? __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmin_vs_f64m1_f64m1(a, __riscv_vfmv_s_f_f64m1(INFINITY, 1), npyv_nlanes_f64)) : NAN; } // NaN-suppressing minimum reductions -NPY_FINLINE float npyv_reduce_minp_f32(npyv_f32 a) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - - vbool32_t valid_mask = __riscv_vmfeq_vv_f32m1_b32(a, a, vlen); +#define npyv_reduce_minp_f32 npyv_reduce_min_f32 +#define npyv_reduce_minp_f64 npyv_reduce_min_f64 - vbool32_t nan_mask = __riscv_vmnot_m_b32(valid_mask, vlen); - - npyv_f32 masked_a = __riscv_vfmerge_vfm_f32m1( - a, - INFINITY, - nan_mask, - vlen - ); - - return __riscv_vfmv_f_s_f32m1_f32( - __riscv_vfredmin_vs_f32m1_f32m1( - masked_a, - __riscv_vfmv_v_f_f32m1(INFINITY, vlen), - vlen - ) - ); -} - -NPY_FINLINE double npyv_reduce_minp_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - - vbool64_t valid_mask = __riscv_vmfeq_vv_f64m1_b64(a, a, vlen); - - vbool64_t nan_mask = __riscv_vmnot_m_b64(valid_mask, vlen); - - npyv_f64 masked_a = __riscv_vfmerge_vfm_f64m1( - a, - INFINITY, - nan_mask, - vlen - ); - - return __riscv_vfmv_f_s_f64m1_f64( - __riscv_vfredmin_vs_f64m1_f64m1( - masked_a, - __riscv_vfmv_v_f_f64m1(INFINITY, vlen), - vlen - ) - ); -} - -// Maximum reductions for 64-bit integers -NPY_FINLINE npy_uint64 npyv_reduce_max_u64(npyv_u64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vmv_x_s_u64m1_u64( - __riscv_vredmax_vs_u64m1_u64m1( - a, - __riscv_vmv_v_x_u64m1(0, vlen), - vlen - ) - ); -} - -NPY_FINLINE npy_int64 npyv_reduce_max_s64(npyv_s64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vmv_x_s_i64m1_i64( - __riscv_vredmax_vs_i64m1_i64m1( - a, - __riscv_vmv_v_x_i64m1(INT64_MIN, vlen), - vlen - ) - ); -} - -NPY_FINLINE npy_uint64 npyv_reduce_min_u64(npyv_u64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vmv_x_s_u64m1_u64( - __riscv_vredmin_vs_u64m1_u64m1( - a, - __riscv_vmv_v_x_u64m1(UINT64_MAX, vlen), - vlen - ) - ); -} - -NPY_FINLINE npy_int64 npyv_reduce_min_s64(npyv_s64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vmv_x_s_i64m1_i64( - __riscv_vredmin_vs_i64m1_i64m1( - a, - __riscv_vmv_v_x_i64m1(INT64_MAX, vlen), - vlen - ) - ); -} +// 
NaN-propagating minimum reductions +NPY_FINLINE float npyv_reduce_minn_f32(npyv_f32 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f32), npyv_nlanes_f32) == -1 ? npyv_reduce_min_f32(a) : NAN; } +NPY_FINLINE double npyv_reduce_minn_f64(npyv_f64 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f64), npyv_nlanes_f64) == -1 ? npyv_reduce_min_f64(a) : NAN; } + +#define NPYV_IMPL_RVV_FCVT(TYPE, FRM) \ + NPY_FINLINE npyv_f32 npyv_##TYPE##_f32(npyv_f32 a) \ + { \ + const int vl = npyv_nlanes_f32; \ + const vfloat32m1_t b = __riscv_vmerge( \ + a, \ + __riscv_vfcvt_f(__riscv_vfcvt_x_f_v_i32m1_rm( \ + a, FRM, vl), vl \ + ), \ + __riscv_vmfle( \ + __riscv_vfabs(a, vl), 1e9, vl \ + ), vl \ + ); \ + feclearexcept(FE_INVALID); \ + return __riscv_vreinterpret_f32m1(__riscv_vor( \ + __riscv_vand( \ + __riscv_vreinterpret_u32m1(a), \ + 1 << 31, vl \ + ), \ + __riscv_vreinterpret_u32m1(b), vl \ + )); \ + } \ + NPY_FINLINE npyv_f64 npyv_##TYPE##_f64(npyv_f64 a) \ + { \ + const int vl = npyv_nlanes_f64; \ + const vfloat64m1_t b = __riscv_vmerge( \ + a, \ + __riscv_vfcvt_f(__riscv_vfcvt_x_f_v_i64m1_rm( \ + a, FRM, vl), vl \ + ), \ + __riscv_vmfle( \ + __riscv_vfabs(a, vl), 1e18, vl \ + ), vl \ + ); \ + feclearexcept(FE_INVALID); \ + return __riscv_vreinterpret_f64m1(__riscv_vor( \ + __riscv_vand( \ + __riscv_vreinterpret_u64m1(a), \ + 1ULL << 63, vl \ + ), \ + __riscv_vreinterpret_u64m1(b), vl \ + )); \ + } // round to nearest integer even -NPY_FINLINE npyv_f32 npyv_rint_f32(npyv_f32 a) -{ - return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RNE, 4), 4); -} - -NPY_FINLINE npyv_f64 npyv_rint_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfcvt_f_x_v_f64m1( - __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RNE, vlen), - vlen - ); -} - -// ceil -NPY_FINLINE npyv_f32 npyv_ceil_f32(npyv_f32 a) -{ - return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RUP, 4), 4); -} - -NPY_FINLINE npyv_f64 npyv_ceil_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfcvt_f_x_v_f64m1( - __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RUP, vlen), - vlen - ); -} - +NPYV_IMPL_RVV_FCVT(rint, __RISCV_FRM_RNE) // trunc -NPY_FINLINE npyv_f32 npyv_trunc_f32(npyv_f32 a) -{ - return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RTZ, 4), 4); -} - -NPY_FINLINE npyv_f64 npyv_trunc_f64(npyv_f64 a) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfcvt_f_x_v_f64m1( - __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RTZ, vlen), - vlen - ); -} - +NPYV_IMPL_RVV_FCVT(trunc, __RISCV_FRM_RTZ) +// ceil +NPYV_IMPL_RVV_FCVT(ceil, __RISCV_FRM_RUP) // floor -NPY_FINLINE npyv_f32 npyv_floor_f32(npyv_f32 a) -{ - return __riscv_vfcvt_f_x_v_f32m1(__riscv_vfcvt_x_f_v_i32m1_rm(a, __RISCV_FRM_RDN, 4), 4); -} - -NPY_FINLINE npyv_f64 npyv_floor_f64(npyv_f64 a) -{ - size_t vl = __riscv_vsetvlmax_e64m1(); - return __riscv_vfcvt_f_x_v_f64m1( - __riscv_vfcvt_x_f_v_i64m1_rm(a, __RISCV_FRM_RDN, vl), - vl - ); -} +NPYV_IMPL_RVV_FCVT(floor, __RISCV_FRM_RDN) +#undef NPYV_IMPL_RVV_FCVT #endif // _NPY_SIMD_RVV_MATH_H diff --git a/numpy/_core/src/common/simd/rvv/memory.h b/numpy/_core/src/common/simd/rvv/memory.h index 5c847b115aad..713732fff08d 100644 --- a/numpy/_core/src/common/simd/rvv/memory.h +++ b/numpy/_core/src/common/simd/rvv/memory.h @@ -11,529 +11,305 @@ * load/store ***************************/ // GCC requires literal type definitions for pointers types otherwise it causes ambiguous errors + // uint8_t 
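The NPYV_IMPL_RVV_FCVT macro above folds rint/trunc/ceil/floor into one pattern: lanes whose magnitude is small enough to still carry a fraction (<= 1e9 for float, <= 1e18 for double) are rounded through an integer conversion with the requested static rounding mode, every other lane (already integral, infinite, or NaN) is passed through, the FE_INVALID flag raised while converting those pass-through lanes is cleared, and the input's sign bit is OR-ed back so results such as -0.0 survive. A scalar sketch of the float case, using the C fenv rounding modes in place of the RVV frm encodings (rint_ref is a hypothetical helper and sketches the semantics rather than the exact instruction sequence):

#include <fenv.h>
#include <math.h>
#include <stdint.h>
#include <string.h>

/* rounding_mode: FE_TONEAREST ~ RNE, FE_TOWARDZERO ~ RTZ,
 *                FE_UPWARD ~ RUP,    FE_DOWNWARD ~ RDN */
static float rint_ref(float x, int rounding_mode)
{
    float r = x;                            /* large/inf/NaN lanes pass through   */
    if (fabsf(x) <= 1e9f) {
        const int saved = fegetround();
        fesetround(rounding_mode);
        r = (float)llrintf(x);              /* rounds using the current mode      */
        fesetround(saved);
    }
    feclearexcept(FE_INVALID);              /* drop spurious invalid from NaN/huge lanes */
    uint32_t xb, rb;                        /* OR the input's sign bit back in    */
    memcpy(&xb, &x, sizeof xb);
    memcpy(&rb, &r, sizeof rb);
    rb |= xb & 0x80000000u;
    memcpy(&r, &rb, sizeof rb);
    return r;
}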
NPY_FINLINE npyv_u8 npyv_load_u8(const npyv_lanetype_u8 *ptr) -{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, 16); } - +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, npyv_nlanes_u8); } NPY_FINLINE npyv_u8 npyv_loada_u8(const npyv_lanetype_u8 *ptr) -{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, 16); } - +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, npyv_nlanes_u8); } NPY_FINLINE npyv_u8 npyv_loads_u8(const npyv_lanetype_u8 *ptr) -{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, 16); } - +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, npyv_nlanes_u8); } NPY_FINLINE npyv_u8 npyv_loadl_u8(const npyv_lanetype_u8 *ptr) -{ - return __riscv_vslideup_vx_u8m1(__riscv_vle8_v_u8m1((const uint8_t*)ptr, 8), __riscv_vmv_v_x_u8m1(0, 8), 8, 16); -} +{ return __riscv_vle8_v_u8m1_tu(__riscv_vmv_v_x_u8m1(0, npyv_nlanes_u8), (const uint8_t*)ptr, npyv_nlanes_u8 / 2); } NPY_FINLINE void npyv_store_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) -{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 16); } - +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8); } NPY_FINLINE void npyv_storea_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) -{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 16); } - +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8); } NPY_FINLINE void npyv_stores_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) -{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 16); } - +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8); } NPY_FINLINE void npyv_storel_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) -{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, 8); } - +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8 / 2); } NPY_FINLINE void npyv_storeh_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) -{ __riscv_vse8_v_u8m1((uint8_t*)ptr, __riscv_vslidedown_vx_u8m1(vec, 8, 16), 8); } - +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, __riscv_vslidedown_vx_u8m1(vec, npyv_nlanes_u8 / 2, npyv_nlanes_u8), npyv_nlanes_u8 / 2); } // int8_t NPY_FINLINE npyv_s8 npyv_load_s8(const npyv_lanetype_s8 *ptr) -{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, 16); } - +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, npyv_nlanes_s8); } NPY_FINLINE npyv_s8 npyv_loada_s8(const npyv_lanetype_s8 *ptr) -{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, 16); } - +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, npyv_nlanes_s8); } NPY_FINLINE npyv_s8 npyv_loads_s8(const npyv_lanetype_s8 *ptr) -{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, 16); } - +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, npyv_nlanes_s8); } NPY_FINLINE npyv_s8 npyv_loadl_s8(const npyv_lanetype_s8 *ptr) -{ - return __riscv_vslideup_vx_i8m1(__riscv_vle8_v_i8m1((const int8_t*)ptr, 8), __riscv_vmv_v_x_i8m1(0, 8), 8, 16); -} +{ return __riscv_vle8_v_i8m1_tu(__riscv_vmv_v_x_i8m1(0, npyv_nlanes_s8), (const int8_t*)ptr, npyv_nlanes_s8 / 2); } NPY_FINLINE void npyv_store_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) -{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 16); } - +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8); } NPY_FINLINE void npyv_storea_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) -{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 16); } - +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8); } NPY_FINLINE void npyv_stores_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) -{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 16); } - +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8); } NPY_FINLINE void npyv_storel_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) -{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, 8); } - +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8 / 2); } NPY_FINLINE void npyv_storeh_s8(npyv_lanetype_s8 *ptr, 
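The half-width memory helpers above now rely on vl and tail-undisturbed loads instead of slide sequences: npyv_loadl_* loads the low half of the vector over a zeroed destination, npyv_storel_* writes only the low half, and npyv_storeh_* slides the high half down before storing it. A scalar sketch of what that means for a vector of N byte lanes (loadl_ref/storeh_ref are illustrative names; N stands for whatever npyv_nlanes_u8 is at runtime):

#include <stddef.h>
#include <string.h>

static void loadl_ref(unsigned char *lane, const unsigned char *ptr, size_t n)
{
    memcpy(lane, ptr, n / 2);               /* low half comes from memory                   */
    memset(lane + n / 2, 0, n - n / 2);     /* high half stays zero (tail over a zero vector) */
}

static void storeh_ref(unsigned char *ptr, const unsigned char *lane, size_t n)
{
    memcpy(ptr, lane + n / 2, n / 2);       /* slide the high half down, store n/2 lanes    */
}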
npyv_s8 vec) -{ __riscv_vse8_v_i8m1((int8_t*)ptr, __riscv_vslidedown_vx_i8m1(vec, 8, 16), 8); } +{ __riscv_vse8_v_i8m1((int8_t*)ptr, __riscv_vslidedown_vx_i8m1(vec, npyv_nlanes_s8 / 2, npyv_nlanes_s8), npyv_nlanes_s8 / 2); } // uint16_t NPY_FINLINE npyv_u16 npyv_load_u16(const npyv_lanetype_u16 *ptr) -{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, 8); } - +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, npyv_nlanes_u16); } NPY_FINLINE npyv_u16 npyv_loada_u16(const npyv_lanetype_u16 *ptr) -{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, 8); } - +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, npyv_nlanes_u16); } NPY_FINLINE npyv_u16 npyv_loads_u16(const npyv_lanetype_u16 *ptr) -{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, 8); } - +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, npyv_nlanes_u16); } NPY_FINLINE npyv_u16 npyv_loadl_u16(const npyv_lanetype_u16 *ptr) -{ - return __riscv_vslideup_vx_u16m1( - __riscv_vle16_v_u16m1((const uint16_t*)ptr, 4), __riscv_vmv_v_x_u16m1(0, 4), 4, 8 - ); -} +{ return __riscv_vle16_v_u16m1_tu(__riscv_vmv_v_x_u16m1(0, npyv_nlanes_u16), (const uint16_t*)ptr, npyv_nlanes_u16 / 2); } NPY_FINLINE void npyv_store_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) -{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 8); } - +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16); } NPY_FINLINE void npyv_storea_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) -{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 8); } - +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16); } NPY_FINLINE void npyv_stores_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) -{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 8); } - +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16); } NPY_FINLINE void npyv_storel_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) -{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, 4); } - +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16 / 2); } NPY_FINLINE void npyv_storeh_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) -{ __riscv_vse16_v_u16m1((uint16_t*)ptr, __riscv_vslidedown_vx_u16m1(vec, 4, 8), 4); } +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, __riscv_vslidedown_vx_u16m1(vec, npyv_nlanes_u16 / 2, npyv_nlanes_u16), npyv_nlanes_u16 / 2); } // int16_t NPY_FINLINE npyv_s16 npyv_load_s16(const npyv_lanetype_s16 *ptr) -{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, 8); } - +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, npyv_nlanes_s16); } NPY_FINLINE npyv_s16 npyv_loada_s16(const npyv_lanetype_s16 *ptr) -{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, 8); } - +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, npyv_nlanes_s16); } NPY_FINLINE npyv_s16 npyv_loads_s16(const npyv_lanetype_s16 *ptr) -{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, 8); } - +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, npyv_nlanes_s16); } NPY_FINLINE npyv_s16 npyv_loadl_s16(const npyv_lanetype_s16 *ptr) -{ - return __riscv_vslideup_vx_i16m1( - __riscv_vle16_v_i16m1((const int16_t*)ptr, 4), __riscv_vmv_v_x_i16m1(0, 4), 4, 8 - ); -} +{ return __riscv_vle16_v_i16m1_tu(__riscv_vmv_v_x_i16m1(0, npyv_nlanes_s16), (const int16_t*)ptr, npyv_nlanes_s16 / 2); } NPY_FINLINE void npyv_store_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) -{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 8); } - +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16); } NPY_FINLINE void npyv_storea_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) -{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 8); } - +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16); } NPY_FINLINE void 
npyv_stores_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) -{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 8); } - +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16); } NPY_FINLINE void npyv_storel_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) -{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, 4); } - +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16 / 2); } NPY_FINLINE void npyv_storeh_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) -{ __riscv_vse16_v_i16m1((int16_t*)ptr, __riscv_vslidedown_vx_i16m1(vec, 4, 8), 4); } +{ __riscv_vse16_v_i16m1((int16_t*)ptr, __riscv_vslidedown_vx_i16m1(vec, npyv_nlanes_s16 / 2, npyv_nlanes_s16), npyv_nlanes_s16 / 2); } // uint32_t NPY_FINLINE npyv_u32 npyv_load_u32(const npyv_lanetype_u32 *ptr) -{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, 4); } - +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, npyv_nlanes_u32); } NPY_FINLINE npyv_u32 npyv_loada_u32(const npyv_lanetype_u32 *ptr) -{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, 4); } - +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, npyv_nlanes_u32); } NPY_FINLINE npyv_u32 npyv_loads_u32(const npyv_lanetype_u32 *ptr) -{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, 4); } - +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, npyv_nlanes_u32); } NPY_FINLINE npyv_u32 npyv_loadl_u32(const npyv_lanetype_u32 *ptr) -{ - return __riscv_vslideup_vx_u32m1( - __riscv_vle32_v_u32m1((const uint32_t*)ptr, 2), __riscv_vmv_v_x_u32m1(0, 2), 2, 4 - ); -} +{ return __riscv_vle32_v_u32m1_tu(__riscv_vmv_v_x_u32m1(0, npyv_nlanes_u32), (const uint32_t*)ptr, npyv_nlanes_u32 / 2); } NPY_FINLINE void npyv_store_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) -{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 4); } - +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32); } NPY_FINLINE void npyv_storea_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) -{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 4); } - +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32); } NPY_FINLINE void npyv_stores_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) -{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 4); } - +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32); } NPY_FINLINE void npyv_storel_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) -{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, 2); } - +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32 / 2); } NPY_FINLINE void npyv_storeh_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) -{ __riscv_vse32_v_u32m1((uint32_t*)ptr, __riscv_vslidedown_vx_u32m1(vec, 2, 4), 2); } +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, __riscv_vslidedown_vx_u32m1(vec, npyv_nlanes_u32 / 2, npyv_nlanes_u32), npyv_nlanes_u32 / 2); } -// int32_t +// int32_t NPY_FINLINE npyv_s32 npyv_load_s32(const npyv_lanetype_s32 *ptr) -{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, 4); } - +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, npyv_nlanes_s32); } NPY_FINLINE npyv_s32 npyv_loada_s32(const npyv_lanetype_s32 *ptr) -{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, 4); } - +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, npyv_nlanes_s32); } NPY_FINLINE npyv_s32 npyv_loads_s32(const npyv_lanetype_s32 *ptr) -{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, 4); } - +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, npyv_nlanes_s32); } NPY_FINLINE npyv_s32 npyv_loadl_s32(const npyv_lanetype_s32 *ptr) -{ - return __riscv_vslideup_vx_i32m1( - __riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vmv_v_x_i32m1(0, 2), 2, 4 - ); -} +{ return __riscv_vle32_v_i32m1_tu(__riscv_vmv_v_x_i32m1(0, npyv_nlanes_s32), 
(const int32_t*)ptr, npyv_nlanes_s32 / 2); } NPY_FINLINE void npyv_store_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) -{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 4); } - +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32); } NPY_FINLINE void npyv_storea_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) -{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 4); } - +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32); } NPY_FINLINE void npyv_stores_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) -{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 4); } - +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32); } NPY_FINLINE void npyv_storel_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) -{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, 2); } - +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32 / 2); } NPY_FINLINE void npyv_storeh_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) -{ __riscv_vse32_v_i32m1((int32_t*)ptr, __riscv_vslidedown_vx_i32m1(vec, 2, 4), 2); } +{ __riscv_vse32_v_i32m1((int32_t*)ptr, __riscv_vslidedown_vx_i32m1(vec, npyv_nlanes_s32 / 2, npyv_nlanes_s32), npyv_nlanes_s32 / 2); } -// uint64_t +// uint64_t NPY_FINLINE npyv_u64 npyv_load_u64(const npyv_lanetype_u64 *ptr) -{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, 2); } - +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, npyv_nlanes_u64); } NPY_FINLINE npyv_u64 npyv_loada_u64(const npyv_lanetype_u64 *ptr) -{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, 2); } - +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, npyv_nlanes_u64); } NPY_FINLINE npyv_u64 npyv_loads_u64(const npyv_lanetype_u64 *ptr) -{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, 2); } - +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, npyv_nlanes_u64); } NPY_FINLINE npyv_u64 npyv_loadl_u64(const npyv_lanetype_u64 *ptr) -{ - return __riscv_vslideup_vx_u64m1( - __riscv_vle64_v_u64m1((const uint64_t*)ptr, 1), __riscv_vmv_v_x_u64m1(0, 1), 1, 2 - ); -} +{ return __riscv_vle64_v_u64m1_tu(__riscv_vmv_v_x_u64m1(0, npyv_nlanes_u64), (const uint64_t*)ptr, npyv_nlanes_u64 / 2); } NPY_FINLINE void npyv_store_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) -{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 2); } - +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64); } NPY_FINLINE void npyv_storea_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) -{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 2); } - +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64); } NPY_FINLINE void npyv_stores_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) -{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 2); } - +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64); } NPY_FINLINE void npyv_storel_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) -{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, 1); } - +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64 / 2); } NPY_FINLINE void npyv_storeh_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) -{ __riscv_vse64_v_u64m1((uint64_t*)ptr, __riscv_vslidedown_vx_u64m1(vec, 1, 2), 1); } +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, __riscv_vslidedown_vx_u64m1(vec, npyv_nlanes_u64 / 2, npyv_nlanes_u64), npyv_nlanes_u64 / 2); } -// int64_t +// int64_t NPY_FINLINE npyv_s64 npyv_load_s64(const npyv_lanetype_s64 *ptr) -{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, 2); } - +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, npyv_nlanes_s64); } NPY_FINLINE npyv_s64 npyv_loada_s64(const npyv_lanetype_s64 *ptr) -{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, 2); } - +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, npyv_nlanes_s64); } NPY_FINLINE npyv_s64 
npyv_loads_s64(const npyv_lanetype_s64 *ptr) -{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, 2); } - +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, npyv_nlanes_s64); } NPY_FINLINE npyv_s64 npyv_loadl_s64(const npyv_lanetype_s64 *ptr) -{ - return __riscv_vslideup_vx_i64m1( - __riscv_vle64_v_i64m1((const int64_t*)ptr, 1), __riscv_vmv_v_x_i64m1(0, 1), 1, 2 - ); -} +{ return __riscv_vle64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(0, npyv_nlanes_s64), (const int64_t*)ptr, npyv_nlanes_s64 / 2); } NPY_FINLINE void npyv_store_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) -{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 2); } - +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64); } NPY_FINLINE void npyv_storea_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) -{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 2); } - +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64); } NPY_FINLINE void npyv_stores_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) -{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 2); } - +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64); } NPY_FINLINE void npyv_storel_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) -{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, 1); } - +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64 / 2); } NPY_FINLINE void npyv_storeh_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) -{ __riscv_vse64_v_i64m1((int64_t*)ptr, __riscv_vslidedown_vx_i64m1(vec, 1, 2), 1); } +{ __riscv_vse64_v_i64m1((int64_t*)ptr, __riscv_vslidedown_vx_i64m1(vec, npyv_nlanes_s64 / 2, npyv_nlanes_s64), npyv_nlanes_s64 / 2); } -// float +// float NPY_FINLINE npyv_f32 npyv_load_f32(const npyv_lanetype_f32 *ptr) -{ return __riscv_vle32_v_f32m1((const float*)ptr, 4); } - +{ return __riscv_vle32_v_f32m1((const float*)ptr, npyv_nlanes_f32); } NPY_FINLINE npyv_f32 npyv_loada_f32(const npyv_lanetype_f32 *ptr) -{ return __riscv_vle32_v_f32m1((const float*)ptr, 4); } - +{ return __riscv_vle32_v_f32m1((const float*)ptr, npyv_nlanes_f32); } NPY_FINLINE npyv_f32 npyv_loads_f32(const npyv_lanetype_f32 *ptr) -{ return __riscv_vle32_v_f32m1((const float*)ptr, 4); } - +{ return __riscv_vle32_v_f32m1((const float*)ptr, npyv_nlanes_f32); } NPY_FINLINE npyv_f32 npyv_loadl_f32(const npyv_lanetype_f32 *ptr) -{ - return __riscv_vslideup_vx_f32m1( - __riscv_vle32_v_f32m1((const float*)ptr, 2), __riscv_vfmv_v_f_f32m1(0, 2), 2, 4 - ); -} +{ return __riscv_vle32_v_f32m1_tu(__riscv_vfmv_v_f_f32m1(0.0f, npyv_nlanes_f32), (const float*)ptr, npyv_nlanes_f32 / 2); } NPY_FINLINE void npyv_store_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) -{ __riscv_vse32_v_f32m1((float*)ptr, vec, 4); } - +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32); } NPY_FINLINE void npyv_storea_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) -{ __riscv_vse32_v_f32m1((float*)ptr, vec, 4); } - +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32); } NPY_FINLINE void npyv_stores_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) -{ __riscv_vse32_v_f32m1((float*)ptr, vec, 4); } - +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32); } NPY_FINLINE void npyv_storel_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) -{ __riscv_vse32_v_f32m1((float*)ptr, vec, 2); } - +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32 / 2); } NPY_FINLINE void npyv_storeh_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) -{ __riscv_vse32_v_f32m1((float*)ptr, __riscv_vslidedown_vx_f32m1(vec, 2, 4), 2); } +{ __riscv_vse32_v_f32m1((float*)ptr, __riscv_vslidedown_vx_f32m1(vec, npyv_nlanes_f32 / 2, npyv_nlanes_f32), npyv_nlanes_f32 / 2); } - -// double +// double NPY_FINLINE npyv_f64 
npyv_load_f64(const npyv_lanetype_f64 *ptr) -{ return __riscv_vle64_v_f64m1((const double*)ptr, 2); } - +{ return __riscv_vle64_v_f64m1((const double*)ptr, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_loada_f64(const npyv_lanetype_f64 *ptr) -{ return __riscv_vle64_v_f64m1((const double*)ptr, 2); } - +{ return __riscv_vle64_v_f64m1((const double*)ptr, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_loads_f64(const npyv_lanetype_f64 *ptr) -{ return __riscv_vle64_v_f64m1((const double*)ptr, 2); } - +{ return __riscv_vle64_v_f64m1((const double*)ptr, npyv_nlanes_f64); } NPY_FINLINE npyv_f64 npyv_loadl_f64(const npyv_lanetype_f64 *ptr) -{ - return __riscv_vslideup_vx_f64m1( - __riscv_vle64_v_f64m1((const double*)ptr, 1), __riscv_vfmv_v_f_f64m1(0, 1), 1, 2 - ); -} +{ return __riscv_vle64_v_f64m1_tu(__riscv_vfmv_v_f_f64m1(0.0, npyv_nlanes_f64), (const double*)ptr, npyv_nlanes_f64 / 2); } NPY_FINLINE void npyv_store_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) -{ __riscv_vse64_v_f64m1((double*)ptr, vec, 2); } - +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64); } NPY_FINLINE void npyv_storea_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) -{ __riscv_vse64_v_f64m1((double*)ptr, vec, 2); } - +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64); } NPY_FINLINE void npyv_stores_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) -{ __riscv_vse64_v_f64m1((double*)ptr, vec, 2); } - +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64); } NPY_FINLINE void npyv_storel_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) -{ __riscv_vse64_v_f64m1((double*)ptr, vec, 1); } - +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64 / 2); } NPY_FINLINE void npyv_storeh_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) -{ __riscv_vse64_v_f64m1((double*)ptr, __riscv_vslidedown_vx_f64m1(vec, 1, 2), 1); } +{ __riscv_vse64_v_f64m1((double*)ptr, __riscv_vslidedown_vx_f64m1(vec, npyv_nlanes_f64 / 2, npyv_nlanes_f64), npyv_nlanes_f64 / 2); } /*************************** * Non-contiguous Load ***************************/ -NPY_FINLINE npyv_s32 vld1q_lane_s32(const int32_t *a, npyv_s32 b, const int lane) { - vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1((uint8_t)(1 << lane), 8)); - npyv_s32 a_dup = __riscv_vmv_v_x_i32m1(a[0], 4); - return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 4); -} - -NPY_FINLINE void vst1q_lane_s32(int32_t *a, npyv_s32 b, const int lane) { - npyv_s32 b_s = __riscv_vslidedown_vx_i32m1(b, lane, 4); - *a = __riscv_vmv_x_s_i32m1_i32(b_s); -} - NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) -{ - npyv_s32 a = __riscv_vmv_v_x_i32m1(0, 4); - a = vld1q_lane_s32((const int32_t*)ptr, a, 0); - a = vld1q_lane_s32((const int32_t*)ptr + stride, a, 1); - a = vld1q_lane_s32((const int32_t*)ptr + stride*2, a, 2); - a = vld1q_lane_s32((const int32_t*)ptr + stride*3, a, 3); - return a; -} - +{ return __riscv_vlse32_v_i32m1((const int32_t*)ptr, stride * sizeof(int32_t), npyv_nlanes_s32); } NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) -{ - return npyv_reinterpret_u32_s32( - npyv_loadn_s32((const npy_int32*)ptr, stride) - ); -} +{ return __riscv_vlse32_v_u32m1((const uint32_t*)ptr, stride * sizeof(uint32_t), npyv_nlanes_u32); } NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) -{ - return npyv_reinterpret_f32_s32( - npyv_loadn_s32((const npy_int32*)ptr, stride) - ); -} +{ return __riscv_vlse32_v_f32m1((const float*)ptr, stride * sizeof(float), npyv_nlanes_f32); } NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp 
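The non-contiguous loads above become single vlse (strided load) instructions. The NPYV API counts the stride in elements while the intrinsic expects bytes, which is why every call scales by sizeof(type). A scalar reference of the gather it performs (loadn_s32_ref is an illustrative helper; nlanes stands for npyv_nlanes_s32):

#include <stddef.h>
#include <stdint.h>

static void loadn_s32_ref(int32_t *lane, const int32_t *ptr,
                          ptrdiff_t stride, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; i++)
        lane[i] = ptr[(ptrdiff_t)i * stride];   /* lane i <- element i*stride (stride may be negative) */
}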
stride) -{ - return __riscv_vslideup_vx_i64m1( - __riscv_vle64_v_i64m1((const int64_t*)ptr, 1), __riscv_vle64_v_i64m1((const int64_t*)ptr + stride, 1), 1, 2 - ); -} +{ return __riscv_vlse64_v_i64m1((const int64_t*)ptr, stride * sizeof(int64_t), npyv_nlanes_s64); } NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) -{ - return npyv_reinterpret_u64_s64( - npyv_loadn_s64((const npy_int64*)ptr, stride) - ); -} - +{ return __riscv_vlse64_v_u64m1((const uint64_t*)ptr, stride * sizeof(uint64_t), npyv_nlanes_u64); } NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) -{ - return npyv_reinterpret_f64_s64( - npyv_loadn_s64((const npy_int64*)ptr, stride) - ); -} +{ return __riscv_vlse64_v_f64m1((const double*)ptr, stride * sizeof(double), npyv_nlanes_f64); } //// 64-bit load over 32-bit stride NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) -{ - return __riscv_vslideup_vx_u32m1( - __riscv_vle32_v_u32m1((const uint32_t*)ptr, 2), __riscv_vle32_v_u32m1((const uint32_t*)ptr + stride, 2), 2, 4 - ); -} - +{ return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vlse64_v_u64m1((const uint64_t*)ptr, stride * sizeof(uint32_t), npyv_nlanes_u64)); } NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) { return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } - NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) { return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } //// 128-bit load over 64-bit stride NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) -{ (void)stride; return npyv_load_u64(ptr); } - +{ + vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64); + id = __riscv_vadd_vx_u64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), id, id, sizeof(uint64_t), npyv_nlanes_u64); + return __riscv_vloxei64_v_u64m1((const uint64_t*)ptr, id, npyv_nlanes_u64); +} NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) -{ (void)stride; return npyv_load_s64(ptr); } +{ return npyv_reinterpret_s64_u64(npyv_loadn2_u64((const npy_uint64*)ptr, stride)); } NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) -{ (void)stride; return npyv_load_f64(ptr); } +{ return npyv_reinterpret_f64_u64(npyv_loadn2_u64((const npy_uint64*)ptr, stride)); } /*************************** * Non-contiguous Store ***************************/ -NPY_FINLINE npyv_s32 vld1_lane_s32(const int32_t *a, npyv_s32 b, const int lane) { - vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1((uint8_t)(1 << lane), 8)); - npyv_s32 a_dup = __riscv_vmv_v_x_i32m1(a[0], 2); - return __riscv_vmerge_vvm_i32m1(b, a_dup, mask, 2); -} - -NPY_FINLINE npyv_u32 vld1_lane_u32(const uint32_t *a, npyv_u32 b, const int lane) { - vbool32_t mask = __riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1((uint8_t)(1 << lane), 8)); - npyv_u32 a_dup = __riscv_vmv_v_x_u32m1(a[0], 2); - return __riscv_vmerge_vvm_u32m1(b, a_dup, mask, 2); -} - -NPY_FINLINE void vst1q_lane_u64(uint64_t *a, npyv_u64 b, const int c) { - npyv_u64 b_s = __riscv_vslidedown_vx_u64m1(b, c, 1); - *a = __riscv_vmv_x_s_u64m1_u64(b_s); -} - -NPY_FINLINE void vst1q_lane_s64(int64_t *a, npyv_s64 b, const int c) { - npyv_s64 b_s = __riscv_vslidedown_vx_i64m1(b, c, 1); - *a = __riscv_vmv_x_s_i64m1_i64(b_s); -} - NPY_FINLINE void npyv_storen_s32(npy_int32 
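npyv_loadn2_u64 above builds its byte-offset index vector as (vid >> 1) * (stride * 8) and then adds 8 to the odd lanes via the 0xAA mask, so lane pair (2k, 2k+1) reads two adjacent 64-bit elements that sit k*stride elements into the array. A scalar reference of that address pattern (loadn2_u64_ref is an illustrative helper; nlanes stands for npyv_nlanes_u64 and is assumed even):

#include <stddef.h>
#include <stdint.h>

static void loadn2_u64_ref(uint64_t *lane, const uint64_t *ptr,
                           ptrdiff_t stride, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; i++) {
        ptrdiff_t off = (ptrdiff_t)(i / 2) * stride;   /* (vid >> 1) * stride              */
        if (i & 1)
            off += 1;                                  /* 0xAA mask: odd lanes take the next element */
        lane[i] = ptr[off];
    }
}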
*ptr, npy_intp stride, npyv_s32 a) -{ - vst1q_lane_s32((int32_t*)ptr, a, 0); - vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); - vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); -} - +{ __riscv_vsse32((int32_t*)ptr, stride * sizeof(int32_t), a, npyv_nlanes_s32); } NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) -{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_u32(a)); } - +{ __riscv_vsse32((uint32_t*)ptr, stride * sizeof(uint32_t), a, npyv_nlanes_u32); } NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) -{ npyv_storen_s32((npy_int32*)ptr, stride, npyv_reinterpret_s32_f32(a)); } +{ __riscv_vsse32((float*)ptr, stride * sizeof(float), a, npyv_nlanes_f32); } NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) -{ - vst1q_lane_s64((int64_t*)ptr, a, 0); - vst1q_lane_s64((int64_t*)ptr + stride, a, 1); -} - +{ __riscv_vsse64((int64_t*)ptr, stride * sizeof(int64_t), a, npyv_nlanes_s64); } NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) -{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_u64(a)); } - +{ __riscv_vsse64((uint64_t*)ptr, stride * sizeof(uint64_t), a, npyv_nlanes_u64); } NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) -{ npyv_storen_s64((npy_int64*)ptr, stride, npyv_reinterpret_s64_f64(a)); } - +{ __riscv_vsse64((double*)ptr, stride * sizeof(double), a, npyv_nlanes_f64); } //// 64-bit store over 32-bit stride NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) -{ - vst1q_lane_u64((uint64_t*)ptr, npyv_reinterpret_u64_u32(a), 0); - vst1q_lane_u64((uint64_t*)(ptr + stride), npyv_reinterpret_u64_u32(a), 1); -} +{ __riscv_vsse64((uint64_t*)ptr, stride * sizeof(uint32_t), __riscv_vreinterpret_v_u32m1_u64m1(a), npyv_nlanes_u64); } NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) { npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); } - NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) { npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); } //// 128-bit store over 64-bit stride NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) -{ (void)stride; npyv_store_u64(ptr, a); } - +{ + vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64); + id = __riscv_vadd_vx_u64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), id, id, sizeof(uint64_t), npyv_nlanes_u64); + __riscv_vsoxei64((uint64_t*)ptr, id, a, npyv_nlanes_u64); +} NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) -{ (void)stride; npyv_store_s64(ptr, a); } - +{ npyv_storen2_u64((npy_uint64*)ptr, stride, npyv_reinterpret_u64_s64(a)); } NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) -{ (void)stride; npyv_store_f64(ptr, a); } +{ npyv_storen2_u64((npy_uint64*)ptr, stride, npyv_reinterpret_u64_f64(a)); } /********************************* * Partial Load *********************************/ //// 32 NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) -{ - assert(nlane > 0); - npyv_s32 a; - switch(nlane) { - case 1: - a = vld1q_lane_s32((const int32_t*)ptr, __riscv_vmv_v_x_i32m1(fill, 4), 0); - break; - case 2: - a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const 
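On the store side, npyv_storen_* maps directly onto vsse (a strided scatter of single lanes), while npyv_storen2_u32 reinterprets the register as 64-bit lanes and scatters those with a 64-bit vsse, so each pair of 32-bit lanes lands contiguously every `stride` elements. A scalar reference of the paired variant, assuming the usual little-endian lane-to-memory order (storen2_u32_ref is an illustrative helper; nlanes32 stands for npyv_nlanes_u32):

#include <stddef.h>
#include <stdint.h>

static void storen2_u32_ref(uint32_t *ptr, ptrdiff_t stride,
                            const uint32_t *lane, size_t nlanes32)
{
    for (size_t k = 0; k < nlanes32 / 2; k++) {
        ptr[(ptrdiff_t)k * stride]     = lane[2 * k];       /* low half of the 64-bit unit  */
        ptr[(ptrdiff_t)k * stride + 1] = lane[2 * k + 1];   /* high half, next element over */
    }
}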
int32_t*)ptr, 2), __riscv_vmv_v_x_i32m1(fill, 2), 2, 4); - break; - case 3: - a = __riscv_vslideup_vx_i32m1( - __riscv_vle32_v_i32m1((const int32_t*)ptr, 2), - vld1_lane_s32((const int32_t*)ptr + 2, __riscv_vmv_v_x_i32m1(fill, 2), 0), 2, 4 - ); - break; - default: - return npyv_load_s32(ptr); - } -#if NPY_SIMD_GUARD_PARTIAL_LOAD - volatile npyv_s32 workaround = a; - a = __riscv_vor_vv_i32m1(workaround, a, 4); -#endif - return a; -} - +{ return __riscv_vle32_v_i32m1_tu(__riscv_vmv_v_x_i32m1(fill, npyv_nlanes_s32), (const int32_t*)ptr, nlane); } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { return npyv_load_till_s32(ptr, nlane, 0); } NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) -{ - assert(nlane > 0); - if (nlane == 1) { - npyv_s64 a = __riscv_vslideup_vx_i64m1(__riscv_vle64_v_i64m1((const int64_t*)ptr, 1), __riscv_vmv_v_x_i64m1(fill, 1), 1, 2); - #if NPY_SIMD_GUARD_PARTIAL_LOAD - volatile npyv_s64 workaround = a; - a = __riscv_vor_vv_i64m1(workaround, a, 2); - #endif - return a; - } - return npyv_load_s64(ptr); -} - +{ return __riscv_vle64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(fill, npyv_nlanes_s64), (const int64_t*)ptr, nlane); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) { return npyv_load_till_s64(ptr, nlane, 0); } @@ -541,19 +317,7 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) //// 64-bit nlane NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill_lo, npy_int32 fill_hi) -{ - assert(nlane > 0); - if (nlane == 1) { - const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; - npyv_s32 a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vle32_v_i32m1(fill, 2), 2, 4); - #if NPY_SIMD_GUARD_PARTIAL_LOAD - volatile npyv_s32 workaround = a; - a = __riscv_vor_vv_i32m1(workaround, a, 4); - #endif - return a; - } - return npyv_load_s32(ptr); -} +{ return __riscv_vreinterpret_v_i64m1_i32m1(npyv_load_till_s64((const npy_int64*)ptr, nlane, (uint64_t)fill_hi << 32 | fill_lo)); } // fill zero to rest lanes NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) { return __riscv_vreinterpret_v_i64m1_i32m1(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); } @@ -561,49 +325,23 @@ NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) //// 128-bit nlane NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill_lo, npy_int64 fill_hi) -{ (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } - +{ + const vint64m1_t fill = __riscv_vmerge(__riscv_vmv_v_x_i64m1(fill_lo, npyv_nlanes_s64), fill_hi, __riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), npyv_nlanes_s64); + return __riscv_vle64_v_i64m1_tu(fill, (const int64_t*)ptr, nlane * 2); +} NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) -{ (void)nlane; return npyv_load_s64(ptr); } +{ return __riscv_vle64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(0, npyv_nlanes_s64), (const int64_t*)ptr, nlane * 2); } /********************************* * Non-contiguous partial load *********************************/ -NPY_FINLINE npyv_s32 -npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) -{ - assert(nlane > 0); - npyv_s32 vfill = __riscv_vmv_v_x_i32m1(fill, 4); - switch(nlane) { - case 3: - vfill = 
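The partial loads collapse to tail-undisturbed loads: the destination starts out splatted with the fill value, the load runs with vl = nlane, and the untouched tail lanes keep the fill (zero for the tillz variants). A scalar reference (load_till_s32_ref is an illustrative helper; nlanes stands for npyv_nlanes_s32 and nlane is assumed to be between 1 and nlanes):

#include <stddef.h>
#include <stdint.h>

static void load_till_s32_ref(int32_t *lane, const int32_t *ptr,
                              size_t nlane, int32_t fill, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; i++)
        lane[i] = (i < nlane) ? ptr[i]      /* body lanes come from memory         */
                              : fill;       /* tail lanes keep the pre-loaded fill */
}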
vld1q_lane_s32((const int32_t*)ptr + stride*2, vfill, 2); - case 2: - vfill = vld1q_lane_s32((const int32_t*)ptr + stride, vfill, 1); - case 1: - vfill = vld1q_lane_s32((const int32_t*)ptr, vfill, 0); - break; - default: - return npyv_loadn_s32(ptr, stride); - } -#if NPY_SIMD_GUARD_PARTIAL_LOAD - volatile npyv_s32 workaround = vfill; - vfill = __riscv_vor_vv_i32m1(workaround, vfill, 4); -#endif - return vfill; -} - +NPY_FINLINE npyv_s32 npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ return __riscv_vlse32_v_i32m1_tu(__riscv_vmv_v_x_i32m1(fill, npyv_nlanes_s32), (const int32_t*)ptr, stride * sizeof(int32_t), nlane); } NPY_FINLINE npyv_s32 npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s32(ptr, stride, nlane, 0); } NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) -{ - assert(nlane > 0); - if (nlane == 1) { - return npyv_load_till_s64(ptr, 1, fill); - } - return npyv_loadn_s64(ptr, stride); -} - +{ return __riscv_vlse64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(fill, npyv_nlanes_s64), (const int64_t*)ptr, stride * sizeof(int64_t), nlane); } // fill zero to rest lanes NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) { return npyv_loadn_till_s64(ptr, stride, nlane, 0); } @@ -611,148 +349,69 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, //// 64-bit load over 32-bit stride NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill_lo, npy_int32 fill_hi) -{ - assert(nlane > 0); - if (nlane == 1) { - const int32_t NPY_DECL_ALIGNED(16) fill[2] = {fill_lo, fill_hi}; - npyv_s32 a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vle32_v_i32m1(fill, 2), 2, 4); - #if NPY_SIMD_GUARD_PARTIAL_LOAD - volatile npyv_s32 workaround = a; - a = __riscv_vor_vv_i32m1(workaround, a, 4); - #endif - return a; - } - return npyv_loadn2_s32(ptr, stride); -} - +{ return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vlse64_v_i64m1_tu(__riscv_vmv_v_x_i64m1((uint64_t)fill_hi << 32 | fill_lo, npyv_nlanes_s64), (const int64_t*)ptr, stride * sizeof(int32_t), nlane)); } NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) -{ - assert(nlane > 0); - if (nlane == 1) { - npyv_s32 a = __riscv_vslideup_vx_i32m1(__riscv_vle32_v_i32m1((const int32_t*)ptr, 2), __riscv_vmv_v_x_i32m1(0, 2), 2, 4); - #if NPY_SIMD_GUARD_PARTIAL_LOAD - volatile npyv_s32 workaround = a; - a = __riscv_vor_vv_i32m1(workaround, a, 4); - #endif - return a; - } - return npyv_loadn2_s32(ptr, stride); -} +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } //// 128-bit load over 64-bit stride NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill_lo, npy_int64 fill_hi) -{ assert(nlane > 0); (void)stride; (void)nlane; (void)fill_lo; (void)fill_hi; return npyv_load_s64(ptr); } - +{ + vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)); + vint64m1_t fill = __riscv_vmerge(__riscv_vmv_v_x_i64m1(fill_lo, npyv_nlanes_s64), fill_hi, mask, npyv_nlanes_s64); + vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64); + id = __riscv_vadd_vx_u64m1_mu(mask, id, id, sizeof(uint64_t), npyv_nlanes_u64); + return 
__riscv_vloxei64_v_i64m1_tu(fill, (const int64_t*)ptr, id, nlane * 2); +} NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) -{ assert(nlane > 0); (void)stride; (void)nlane; return npyv_load_s64(ptr); } +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } /********************************* * Partial store *********************************/ //// 32 NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) -{ - assert(nlane > 0); - switch(nlane) { - case 1: - vst1q_lane_s32((int32_t*)ptr, a, 0); - break; - case 2: - __riscv_vse32_v_i32m1((int32_t*)ptr, a, 2); - break; - case 3: - __riscv_vse32_v_i32m1((int32_t*)ptr, a, 2); - vst1q_lane_s32((int32_t*)ptr + 2, a, 2); - break; - default: - npyv_store_s32(ptr, a); - } -} +{ __riscv_vse32((int32_t*)ptr, a, nlane); } //// 64 NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) -{ - assert(nlane > 0); - if (nlane == 1) { - vst1q_lane_s64((int64_t*)ptr, a, 0); - return; - } - npyv_store_s64(ptr, a); -} +{ __riscv_vse64((int64_t*)ptr, a, nlane); } //// 64-bit nlane NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) -{ - assert(nlane > 0); - if (nlane == 1) { - // armhf strict to alignment, may cause bus error - vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); - return; - } - npyv_store_s32(ptr, a); -} +{ npyv_store_till_s64((npy_int64*)ptr, nlane, npyv_reinterpret_s64_s32(a)); } //// 128-bit nlane NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) -{ - assert(nlane > 0); (void)nlane; - npyv_store_s64(ptr, a); -} +{ npyv_store_till_s64((npy_int64*)ptr, nlane * 2, a); } /********************************* * Non-contiguous partial store *********************************/ //// 32 NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) -{ - assert(nlane > 0); - vst1q_lane_s32((int32_t*)ptr, a, 0); - switch(nlane) { - case 1: - return; - case 2: - vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - return; - case 3: - vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); - return; - default: - vst1q_lane_s32((int32_t*)ptr + stride, a, 1); - vst1q_lane_s32((int32_t*)ptr + stride*2, a, 2); - vst1q_lane_s32((int32_t*)ptr + stride*3, a, 3); - } -} +{ __riscv_vsse32((int32_t*)ptr, stride * sizeof(int32_t), a, nlane); } + //// 64 NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) -{ - assert(nlane > 0); - if (nlane == 1) { - vst1q_lane_s64((int64_t*)ptr, a, 0); - return; - } - npyv_storen_s64(ptr, stride, a); -} +{ __riscv_vsse64((int64_t*)ptr, stride * sizeof(int64_t), a, nlane); } //// 64-bit store over 32-bit stride NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) -{ - assert(nlane > 0); - vst1q_lane_s64((int64_t*)ptr, npyv_reinterpret_s64_s32(a), 0); - if (nlane > 1) { - vst1q_lane_s64((int64_t*)(ptr + stride), npyv_reinterpret_s64_s32(a), 1); - } -} +{ __riscv_vsse64((int64_t*)ptr, stride * sizeof(int32_t), __riscv_vreinterpret_v_i32m1_i64m1(a), nlane); } //// 128-bit store over 64-bit stride NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) -{ assert(nlane > 0); (void)stride; (void)nlane; npyv_store_s64(ptr, a); } +{ + vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), 
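Partial stores follow the same idea in the other direction: the store simply runs with vl = nlane, so only the first nlane lanes (or nlane strided positions, for the vsse forms) reach memory, and the *2 variants double nlane because each logical element occupies two lanes. A scalar reference of the contiguous and strided cases (store_till_s32_ref/storen_till_s32_ref are illustrative helpers):

#include <stddef.h>
#include <stdint.h>

static void store_till_s32_ref(int32_t *ptr, size_t nlane, const int32_t *lane)
{
    for (size_t i = 0; i < nlane; i++)
        ptr[i] = lane[i];                                /* lanes past nlane are not written */
}

static void storen_till_s32_ref(int32_t *ptr, ptrdiff_t stride,
                                size_t nlane, const int32_t *lane)
{
    for (size_t i = 0; i < nlane; i++)
        ptr[(ptrdiff_t)i * stride] = lane[i];            /* strided scatter, vl = nlane      */
}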
npyv_nlanes_u64); + id = __riscv_vadd_vx_u64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), id, id, sizeof(uint64_t), npyv_nlanes_u64); + __riscv_vsoxei64((int64_t*)ptr, id, a, nlane * 2); +} /***************************************************************** * Implement partial load/store for u32/f32/u64/f64... via casting *****************************************************************/ -#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ +#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ { \ @@ -813,9 +472,10 @@ NPYV_IMPL_RVV_REST_PARTIAL_TYPES(u32, s32) NPYV_IMPL_RVV_REST_PARTIAL_TYPES(f32, s32) NPYV_IMPL_RVV_REST_PARTIAL_TYPES(u64, s64) NPYV_IMPL_RVV_REST_PARTIAL_TYPES(f64, s64) +#undef NPYV_IMPL_RVV_REST_PARTIAL_TYPES // 128-bit/64-bit stride -#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ +#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ @@ -884,85 +544,64 @@ NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(u32, s32) NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(f32, s32) NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(u64, s64) NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(f64, s64) +#undef NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR /************************************************************ * de-interleave load / interleave contiguous store ************************************************************/ // two channels -#define NPYV_IMPL_RVV_MEM_INTERLEAVE(SFX) \ - NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX, npyv_##SFX); \ - NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX, npyv_##SFX); \ - NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ - const npyv_lanetype_##SFX *ptr \ - ) { \ - return npyv_unzip_##SFX( \ - npyv_load_##SFX(ptr), npyv_load_##SFX(ptr+npyv_nlanes_##SFX) \ - ); \ - } \ - NPY_FINLINE void npyv_store_##SFX##x2( \ - npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ - ) { \ - npyv_##SFX##x2 zip = npyv_zip_##SFX(v.val[0], v.val[1]); \ - npyv_store_##SFX(ptr, zip.val[0]); \ - npyv_store_##SFX(ptr + npyv_nlanes_##SFX, zip.val[1]); \ +#define NPYV_IMPL_RVV_MEM_INTERLEAVE(SFX, R_SFX, EEW) \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + npyv__##SFX##x2 v = __riscv_vlseg2##EEW##_v_##R_SFX##m1x2( \ + ptr, npyv_nlanes_##SFX \ + ); \ + return (npyv_##SFX##x2){{ \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(v, 0), \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(v, 1) \ + }}; \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + __riscv_vsseg2##EEW( \ + ptr, \ + __riscv_vcreate_v_##R_SFX##m1x2(v.val[0], v.val[1]), \ + npyv_nlanes_##SFX \ + ); \ } -NPYV_IMPL_RVV_MEM_INTERLEAVE(u8) -NPYV_IMPL_RVV_MEM_INTERLEAVE(s8) -NPYV_IMPL_RVV_MEM_INTERLEAVE(u16) -NPYV_IMPL_RVV_MEM_INTERLEAVE(s16) -NPYV_IMPL_RVV_MEM_INTERLEAVE(u32) -NPYV_IMPL_RVV_MEM_INTERLEAVE(s32) -NPYV_IMPL_RVV_MEM_INTERLEAVE(u64) -NPYV_IMPL_RVV_MEM_INTERLEAVE(s64) -NPYV_IMPL_RVV_MEM_INTERLEAVE(f32) -NPYV_IMPL_RVV_MEM_INTERLEAVE(f64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u8, u8, e8) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s8, i8, e8) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u16, u16, e16) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s16, i16, e16) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u32, u32, e32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s32, i32, 
e32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u64, u64, e64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s64, i64, e64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(f32, f32, e32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(f64, f64, e64) +#undef NPYV_IMPL_RVV_MEM_INTERLEAVE /********************************* * Lookup table *********************************/ // uses vector as indexes into a table // that contains 32 elements of uint32. -NPY_FINLINE npyv_u32 vcreate_u32(uint64_t a) { - return __riscv_vreinterpret_v_u64m1_u32m1( - __riscv_vreinterpret_v_i64m1_u64m1(__riscv_vreinterpret_v_u64m1_i64m1(__riscv_vmv_v_x_u64m1(a, 8)))); -} - NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) -{ - const unsigned i0 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 0, 4)); - const unsigned i1 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 1, 4)); - const unsigned i2 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 2, 4)); - const unsigned i3 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(idx, 3, 4)); - - npyv_u32 low = vcreate_u32(table[i0]); - low = vld1_lane_u32((const uint32_t*)table + i1, low, 1); - npyv_u32 high = vcreate_u32(table[i2]); - high = vld1_lane_u32((const uint32_t*)table + i3, high, 1); - return __riscv_vslideup_vx_u32m1(low, high, 2, 4); -} - +{ return __riscv_vloxei32_v_u32m1((const uint32_t*)table, __riscv_vmul(idx, sizeof(uint32_t), npyv_nlanes_u32), npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) { return npyv_reinterpret_s32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } - NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) { return npyv_reinterpret_f32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } // uses vector as indexes into a table // that contains 16 elements of uint64. 
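The lookup-table helpers become single indexed loads: vloxei takes byte offsets, so the lane indices are scaled by sizeof(uint32_t) (or sizeof(uint64_t) for the lut16 form below) before gathering from the table. A scalar reference of the gather (lut32_u32_ref is an illustrative helper; nlanes stands for npyv_nlanes_u32, and each index is expected to be below 32):

#include <stddef.h>
#include <stdint.h>

static void lut32_u32_ref(uint32_t *lane, const uint32_t *table,
                          const uint32_t *idx, size_t nlanes)
{
    for (size_t i = 0; i < nlanes; i++)
        lane[i] = table[idx[i]];             /* byte offset idx[i] * sizeof(uint32_t) in the intrinsic */
}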
NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) -{ - const unsigned i0 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(idx), 0, 4)); - const unsigned i1 = __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(idx), 2, 4)); - return __riscv_vslideup_vx_u64m1( - __riscv_vle64_v_u64m1((const uint64_t*)table + i0, 1), - __riscv_vle64_v_u64m1((const uint64_t*)table + i1, 1), 1, 2 - ); -} - +{ return __riscv_vloxei64_v_u64m1((const uint64_t*)table, __riscv_vmul(idx, sizeof(uint64_t), npyv_nlanes_u64), npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) { return npyv_reinterpret_s64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } - NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) { return npyv_reinterpret_f64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } diff --git a/numpy/_core/src/common/simd/rvv/misc.h b/numpy/_core/src/common/simd/rvv/misc.h index 223471ee8f74..90b913df5c84 100644 --- a/numpy/_core/src/common/simd/rvv/misc.h +++ b/numpy/_core/src/common/simd/rvv/misc.h @@ -5,161 +5,105 @@ #ifndef _NPY_SIMD_RVV_MISC_H #define _NPY_SIMD_RVV_MISC_H +#include "conversion.h" + // vector with zero lanes #define npyv_zero_u8() __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) #define npyv_zero_s8() __riscv_vreinterpret_v_i32m1_i8m1(npyv_zero_s32()) #define npyv_zero_u16() __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) #define npyv_zero_s16() __riscv_vreinterpret_v_i32m1_i16m1(npyv_zero_s32()) -#define npyv_zero_u32() __riscv_vmv_v_x_u32m1((uint32_t)0, 4) -#define npyv_zero_s32() __riscv_vmv_v_x_i32m1((int32_t)0, 4) +#define npyv_zero_u32() __riscv_vmv_v_x_u32m1((uint32_t)0, npyv_nlanes_u32) +#define npyv_zero_s32() __riscv_vmv_v_x_i32m1((int32_t)0, npyv_nlanes_s32) #define npyv_zero_u64() __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) #define npyv_zero_s64() __riscv_vreinterpret_v_i32m1_i64m1(npyv_zero_s32()) -#define npyv_zero_f32() __riscv_vfmv_v_f_f32m1(0.0f, 4) -#define npyv_zero_f64() __riscv_vfmv_v_f_f64m1(0.0, 2) +#define npyv_zero_f32() __riscv_vfmv_v_f_f32m1(0.0f, npyv_nlanes_f32) +#define npyv_zero_f64() __riscv_vfmv_v_f_f64m1(0.0, npyv_nlanes_f64) // vector with a specific value set to all lanes NPY_FINLINE npyv_u8 npyv_setall_u8(uint8_t val) -{ - size_t vlen = __riscv_vsetvlmax_e8m1(); - return __riscv_vmv_v_x_u8m1(val, vlen); -} - +{ return __riscv_vmv_v_x_u8m1(val, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_setall_s8(int8_t val) -{ - size_t vlen = __riscv_vsetvlmax_e8m1(); - return __riscv_vmv_v_x_i8m1(val, vlen); -} - +{ return __riscv_vmv_v_x_i8m1(val, npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_setall_u16(uint16_t val) -{ - size_t vlen = __riscv_vsetvlmax_e16m1(); - return __riscv_vmv_v_x_u16m1(val, vlen); -} - +{ return __riscv_vmv_v_x_u16m1(val, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_setall_s16(int16_t val) -{ - size_t vlen = __riscv_vsetvlmax_e16m1(); - return __riscv_vmv_v_x_i16m1(val, vlen); -} - +{ return __riscv_vmv_v_x_i16m1(val, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_setall_u32(uint32_t val) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vmv_v_x_u32m1(val, vlen); -} - +{ return __riscv_vmv_v_x_u32m1(val, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_setall_s32(int32_t val) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return 
__riscv_vmv_v_x_i32m1(val, vlen); -} - +{ return __riscv_vmv_v_x_i32m1(val, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_setall_u64(uint64_t val) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vmv_v_x_u64m1(val, vlen); -} - +{ return __riscv_vmv_v_x_u64m1(val, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_setall_s64(int64_t val) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vmv_v_x_i64m1(val, vlen); -} - +{ return __riscv_vmv_v_x_i64m1(val, npyv_nlanes_s64); } NPY_FINLINE npyv_f32 npyv_setall_f32(float val) -{ - size_t vlen = __riscv_vsetvlmax_e32m1(); - return __riscv_vfmv_v_f_f32m1(val, vlen); -} - +{ return __riscv_vfmv_v_f_f32m1(val, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_setall_f64(double val) -{ - size_t vlen = __riscv_vsetvlmax_e64m1(); - return __riscv_vfmv_v_f_f64m1(val, vlen); -} +{ return __riscv_vfmv_v_f_f64m1(val, npyv_nlanes_f64); } // vector with specific values set to each lane and -// set a specific value to all remained lanes -NPY_FINLINE npyv_u8 npyv__set_u8(npy_uint8 i0, npy_uint8 i1, npy_uint8 i2, npy_uint8 i3, - npy_uint8 i4, npy_uint8 i5, npy_uint8 i6, npy_uint8 i7, npy_uint8 i8, npy_uint8 i9, - npy_uint8 i10, npy_uint8 i11, npy_uint8 i12, npy_uint8 i13, npy_uint8 i14, npy_uint8 i15) -{ - const uint8_t NPY_DECL_ALIGNED(16) data[16] = { - i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 - }; - return __riscv_vle8_v_u8m1(data, 16); -} - -NPY_FINLINE npyv_s8 npyv__set_s8(npy_int8 i0, npy_int8 i1, npy_int8 i2, npy_int8 i3, - npy_int8 i4, npy_int8 i5, npy_int8 i6, npy_int8 i7, npy_int8 i8, npy_int8 i9, - npy_int8 i10, npy_int8 i11, npy_int8 i12, npy_int8 i13, npy_int8 i14, npy_int8 i15) -{ - const int8_t NPY_DECL_ALIGNED(16) data[16] = { - i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15 - }; - return __riscv_vle8_v_i8m1(data, 16); -} - -NPY_FINLINE vuint16m1_t npyv__set_u16(npy_uint16 i0, npy_uint16 i1, npy_uint16 i2, npy_uint16 i3, - npy_uint16 i4, npy_uint16 i5, npy_uint16 i6, npy_uint16 i7) -{ - const uint16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - return __riscv_vle16_v_u16m1(data, 8); -} - -NPY_FINLINE vint16m1_t npyv__set_s16(npy_int16 i0, npy_int16 i1, npy_int16 i2, npy_int16 i3, - npy_int16 i4, npy_int16 i5, npy_int16 i6, npy_int16 i7) -{ - const int16_t NPY_DECL_ALIGNED(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - return __riscv_vle16_v_i16m1(data, 8); -} - -NPY_FINLINE npyv_u32 npyv__set_u32(npy_uint32 i0, npy_uint32 i1, npy_uint32 i2, npy_uint32 i3) -{ - const uint32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; - return __riscv_vle32_v_u32m1(data, 4); -} - -NPY_FINLINE vint32m1_t npyv__set_s32(npy_int32 i0, npy_int32 i1, npy_int32 i2, npy_int32 i3) -{ - const int32_t NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; - return __riscv_vle32_v_i32m1(data, 4); -} - -NPY_FINLINE npyv_u64 npyv__set_u64(npy_uint64 i0, npy_uint64 i1) -{ - const uint64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; - return __riscv_vle64_v_u64m1(data, 2); -} - -NPY_FINLINE vint64m1_t npyv__set_s64(npy_int64 i0, npy_int64 i1) -{ - const int64_t NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; - return __riscv_vle64_v_i64m1(data, 2); -} - -NPY_FINLINE vfloat32m1_t npyv__set_f32(float i0, float i1, float i2, float i3) -{ - const float NPY_DECL_ALIGNED(16) data[4] = {i0, i1, i2, i3}; - return __riscv_vle32_v_f32m1(data, 4); -} - -NPY_FINLINE vfloat64m1_t npyv__set_f64(double i0, double i1) -{ - const double NPY_DECL_ALIGNED(16) data[2] = {i0, i1}; - return __riscv_vle64_v_f64m1(data, 
2); -} - -#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPYV__SET_FILL_16(npy_uint8, FILL, __VA_ARGS__)) -#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPYV__SET_FILL_16(npy_int8, FILL, __VA_ARGS__)) -#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPYV__SET_FILL_8(npy_uint16, FILL, __VA_ARGS__)) -#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPYV__SET_FILL_8(npy_int16, FILL, __VA_ARGS__)) -#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPYV__SET_FILL_4(npy_uint32, FILL, __VA_ARGS__)) -#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPYV__SET_FILL_4(npy_int32, FILL, __VA_ARGS__)) -#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPYV__SET_FILL_2(npy_uint64, FILL, __VA_ARGS__)) -#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPYV__SET_FILL_2(npy_int64, FILL, __VA_ARGS__)) -#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPYV__SET_FILL_4(float, FILL, __VA_ARGS__)) -#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPYV__SET_FILL_2(double, FILL, __VA_ARGS__)) +// set zero to all remained lanes +#define npyv__set_u8(...) \ + ({ \ + const uint8_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u8] = { __VA_ARGS__ }; \ + __riscv_vle8_v_u8m1(v, npyv_nlanes_u8); \ + }) +#define npyv__set_s8(...) \ + ({ \ + const int8_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s8] = { __VA_ARGS__ }; \ + __riscv_vle8_v_i8m1(v, npyv_nlanes_s8); \ + }) +#define npyv__set_u16(...) \ + ({ \ + const uint16_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u16] = { __VA_ARGS__ }; \ + __riscv_vle16_v_u16m1(v, npyv_nlanes_u16); \ + }) +#define npyv__set_s16(...) \ + ({ \ + const int16_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s16] = { __VA_ARGS__ }; \ + __riscv_vle16_v_i16m1(v, npyv_nlanes_s16); \ + }) +#define npyv__set_u32(...) \ + ({ \ + const uint32_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u32] = { __VA_ARGS__ }; \ + __riscv_vle32_v_u32m1(v, npyv_nlanes_u32); \ + }) +#define npyv__set_s32(...) \ + ({ \ + const int32_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s32] = { __VA_ARGS__ }; \ + __riscv_vle32_v_i32m1(v, npyv_nlanes_s32); \ + }) +#define npyv__set_u64(...) \ + ({ \ + const uint64_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u64] = { __VA_ARGS__ }; \ + __riscv_vle64_v_u64m1(v, npyv_nlanes_u64); \ + }) +#define npyv__set_s64(...) \ + ({ \ + const int64_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s64] = { __VA_ARGS__ }; \ + __riscv_vle64_v_i64m1(v, npyv_nlanes_s64); \ + }) +#define npyv__set_f32(...) \ + ({ \ + const float NPY_DECL_ALIGNED(16) v[npyv_nlanes_f32] = { __VA_ARGS__ }; \ + __riscv_vle32_v_f32m1(v, npyv_nlanes_f32); \ + }) +#define npyv__set_f64(...) \ + ({ \ + const double NPY_DECL_ALIGNED(16) v[npyv_nlanes_f64] = { __VA_ARGS__ }; \ + __riscv_vle64_v_f64m1(v, npyv_nlanes_f64); \ + }) + +#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u8)(npy_uint8, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s8)(npy_int8, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u16)(npy_uint16, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s16)(npy_int16, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u32)(npy_uint32, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s32)(npy_int32, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u64)(npy_uint64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) 
npyv__set_s64(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s64)(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_f32)(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_f64)(double, FILL, __VA_ARGS__)) // vector with specific values set to each lane and // set zero to all remained lanes @@ -175,577 +119,249 @@ NPY_FINLINE vfloat64m1_t npyv__set_f64(double i0, double i1) #define npyv_set_f64(...) npyv_setf_f64(0, __VA_ARGS__) // Per lane select -NPY_FINLINE npyv_u8 npyv_select_u8(npyv_u8 a, npyv_u8 b, npyv_u8 c) -{ - return __riscv_vxor_vv_u8m1(__riscv_vand_vv_u8m1(__riscv_vxor_vv_u8m1(c, b, 16), a, 16), c, 16); -} - -NPY_FINLINE npyv_s8 npyv_select_s8(npyv_u8 a, npyv_s8 b, npyv_s8 c) -{ - return __riscv_vxor_vv_i8m1( - __riscv_vand_vv_i8m1(__riscv_vxor_vv_i8m1(c, b, 16), __riscv_vreinterpret_v_u8m1_i8m1(a), 16), c, 16); -} - -NPY_FINLINE npyv_u16 npyv_select_u16(npyv_u16 a, npyv_u16 b, npyv_u16 c) -{ - return __riscv_vxor_vv_u16m1(__riscv_vand_vv_u16m1(__riscv_vxor_vv_u16m1(c, b, 8), a, 8), c, 8); -} - -NPY_FINLINE npyv_s16 npyv_select_s16(npyv_u16 a, npyv_s16 b, npyv_s16 c) -{ - return __riscv_vxor_vv_i16m1( - __riscv_vand_vv_i16m1(__riscv_vxor_vv_i16m1(c, b, 8), __riscv_vreinterpret_v_u16m1_i16m1(a), 8), c, 8); -} - -NPY_FINLINE npyv_u32 npyv_select_u32(npyv_u32 a, npyv_u32 b, npyv_u32 c) -{ - return __riscv_vxor_vv_u32m1(__riscv_vand_vv_u32m1(__riscv_vxor_vv_u32m1(c, b, 4), a, 4), c, 4); -} - -NPY_FINLINE npyv_s32 npyv_select_s32(npyv_u32 a, npyv_s32 b, npyv_s32 c) -{ - return __riscv_vxor_vv_i32m1( - __riscv_vand_vv_i32m1(__riscv_vxor_vv_i32m1(c, b, 4), __riscv_vreinterpret_v_u32m1_i32m1(a), 4), c, 4); -} - -NPY_FINLINE npyv_u64 npyv_select_u64(npyv_u64 a, npyv_u64 b, npyv_u64 c) -{ - return __riscv_vxor_vv_u64m1(__riscv_vand_vv_u64m1(__riscv_vxor_vv_u64m1(c, b, 2), a, 2), c, 2); -} - -NPY_FINLINE npyv_s64 npyv_select_s64(npyv_u64 a, npyv_s64 b, npyv_s64 c) -{ - return __riscv_vxor_vv_i64m1( - __riscv_vand_vv_i64m1(__riscv_vxor_vv_i64m1(c, b, 2), __riscv_vreinterpret_v_u64m1_i64m1(a), 2), c, 2); -} - -NPY_FINLINE npyv_f32 npyv_select_f32(npyv_u32 a, npyv_f32 b, npyv_f32 c) -{ - npyv_u32 b_u32 = __riscv_vreinterpret_v_f32m1_u32m1(b); - npyv_u32 c_u32 = __riscv_vreinterpret_v_f32m1_u32m1(c); - return __riscv_vreinterpret_v_u32m1_f32m1( - __riscv_vxor_vv_u32m1(__riscv_vand_vv_u32m1(__riscv_vxor_vv_u32m1(c_u32, b_u32, 4), a, 4), c_u32, 4)); -} - -NPY_FINLINE npyv_f64 npyv_select_f64(npyv_u64 a, npyv_f64 b, npyv_f64 c) -{ - npyv_u64 b_u64 = __riscv_vreinterpret_v_f64m1_u64m1(b); - npyv_u64 c_u64 = __riscv_vreinterpret_v_f64m1_u64m1(c); - return __riscv_vreinterpret_v_u64m1_f64m1( - __riscv_vxor_vv_u64m1(__riscv_vand_vv_u64m1(__riscv_vxor_vv_u64m1(c_u64, b_u64, 2), a, 2), c_u64, 2)); -} +NPY_FINLINE npyv_u8 npyv_select_u8(npyv_b8 a, npyv_u8 b, npyv_u8 c) +{ return __riscv_vmerge_vvm_u8m1(c, b, npyv__from_b8(a), npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_select_s8(npyv_b8 a, npyv_s8 b, npyv_s8 c) +{ return __riscv_vmerge_vvm_i8m1(c, b, npyv__from_b8(a), npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_select_u16(npyv_b16 a, npyv_u16 b, npyv_u16 c) +{ return __riscv_vmerge_vvm_u16m1(c, b, npyv__from_b16(a), npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_select_s16(npyv_b16 a, npyv_s16 b, npyv_s16 c) +{ return __riscv_vmerge_vvm_i16m1(c, b, npyv__from_b16(a), npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_select_u32(npyv_b32 a, npyv_u32 b, npyv_u32 c) +{ return 
__riscv_vmerge_vvm_u32m1(c, b, npyv__from_b32(a), npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_select_s32(npyv_b32 a, npyv_s32 b, npyv_s32 c) +{ return __riscv_vmerge_vvm_i32m1(c, b, npyv__from_b32(a), npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_select_u64(npyv_b64 a, npyv_u64 b, npyv_u64 c) +{ return __riscv_vmerge_vvm_u64m1(c, b, npyv__from_b64(a), npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_select_s64(npyv_b64 a, npyv_s64 b, npyv_s64 c) +{ return __riscv_vmerge_vvm_i64m1(c, b, npyv__from_b64(a), npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_select_f32(npyv_b32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vmerge_vvm_f32m1(c, b, npyv__from_b32(a), npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_select_f64(npyv_b64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vmerge_vvm_f64m1(c, b, npyv__from_b64(a), npyv_nlanes_f64); } // extract the first vector's lane NPY_FINLINE npy_uint8 npyv_extract0_u8(npyv_u8 a) -{ - return __riscv_vmv_x_s_u8m1_u8(__riscv_vslidedown_vx_u8m1(a, 0, 16)); -} - +{ return __riscv_vmv_x_s_u8m1_u8(a); } NPY_FINLINE npy_int8 npyv_extract0_s8(npyv_s8 a) -{ - return __riscv_vmv_x_s_i8m1_i8(__riscv_vslidedown_vx_i8m1(a, 0, 16)); -} - +{ return __riscv_vmv_x_s_i8m1_i8(a); } NPY_FINLINE npy_uint16 npyv_extract0_u16(npyv_u16 a) -{ - return __riscv_vmv_x_s_u16m1_u16(__riscv_vslidedown_vx_u16m1(a, 0, 8)); -} - +{ return __riscv_vmv_x_s_u16m1_u16(a); } NPY_FINLINE npy_int16 npyv_extract0_s16(npyv_s16 a) -{ - return __riscv_vmv_x_s_i16m1_i16(__riscv_vslidedown_vx_i16m1(a, 0, 8)); -} - +{ return __riscv_vmv_x_s_i16m1_i16(a); } NPY_FINLINE npy_uint32 npyv_extract0_u32(npyv_u32 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(a, 0, 4)); -} - +{ return __riscv_vmv_x_s_u32m1_u32(a); } NPY_FINLINE npy_int32 npyv_extract0_s32(npyv_s32 a) -{ - return __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(a, 0, 4)); -} - +{ return __riscv_vmv_x_s_i32m1_i32(a); } NPY_FINLINE npy_uint64 npyv_extract0_u64(npyv_u64 a) -{ - return __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(a, 0, 2)); -} - +{ return __riscv_vmv_x_s_u64m1_u64(a); } NPY_FINLINE npy_int64 npyv_extract0_s64(npyv_s64 a) -{ - return __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(a, 0, 2)); -} - +{ return __riscv_vmv_x_s_i64m1_i64(a); } NPY_FINLINE float npyv_extract0_f32(npyv_f32 a) -{ - return __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(a, 0, 4)); -} - +{ return __riscv_vfmv_f_s_f32m1_f32(a); } NPY_FINLINE double npyv_extract0_f64(npyv_f64 a) -{ - return __riscv_vfmv_f_s_f64m1_f64(__riscv_vslidedown_vx_f64m1(a, 0, 2)); -} +{ return __riscv_vfmv_f_s_f64m1_f64(a); } // Reinterpret #define npyv_reinterpret_u8_u8(X) X NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_i8m1_u8m1(a); -} - +{ return __riscv_vreinterpret_v_i8m1_u8m1(a); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_u16m1_u8m1(a); -} - +{ return __riscv_vreinterpret_v_u16m1_u8m1(a); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_u16m1_u8m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); -} - +{ return __riscv_vreinterpret_v_u16m1_u8m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_u32m1_u8m1(a); -} - +{ return __riscv_vreinterpret_v_u32m1_u8m1(a); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); -} - +{ return 
__riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_u64m1_u8m1(a); -} - +{ return __riscv_vreinterpret_v_u64m1_u8m1(a); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); -} - +{ return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); } NPY_FINLINE npyv_u8 npyv_reinterpret_u8_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); -} +{ return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); } #define npyv_reinterpret_s8_s8(X) X NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_u8m1_i8m1(a); -} - +{ return __riscv_vreinterpret_v_u8m1_i8m1(a); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_i16m1_i8m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); -} - +{ return __riscv_vreinterpret_v_i16m1_i8m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_i16m1_i8m1(a); -} - +{ return __riscv_vreinterpret_v_i16m1_i8m1(a); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_i32m1_i8m1(a); -} - +{ return __riscv_vreinterpret_v_i32m1_i8m1(a); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); -} - +{ return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_i64m1_i8m1(a); -} - +{ return __riscv_vreinterpret_v_i64m1_i8m1(a); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); } NPY_FINLINE npyv_s8 npyv_reinterpret_s8_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); -} +{ return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); } #define npyv_reinterpret_u16_u16(X) X NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_u8m1_u16m1(a); -} - +{ return __riscv_vreinterpret_v_u8m1_u16m1(a); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_u8m1_u16m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); -} - +{ return __riscv_vreinterpret_v_u8m1_u16m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_i16m1_u16m1(a); -} - +{ return __riscv_vreinterpret_v_i16m1_u16m1(a); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_u32m1_u16m1(a); -} - +{ return __riscv_vreinterpret_v_u32m1_u16m1(a); } NPY_FINLINE npyv_u16 
npyv_reinterpret_u16_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_u64m1_u16m1(a); -} - +{ return __riscv_vreinterpret_v_u64m1_u16m1(a); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); -} - +{ return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); } NPY_FINLINE npyv_u16 npyv_reinterpret_u16_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); -} +{ return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); } #define npyv_reinterpret_s16_s16(X) X NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_i8m1_i16m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); -} - +{ return __riscv_vreinterpret_v_i8m1_i16m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_i8m1_i16m1(a); -} - +{ return __riscv_vreinterpret_v_i8m1_i16m1(a); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_u16m1_i16m1(a); -} - +{ return __riscv_vreinterpret_v_u16m1_i16m1(a); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_i32m1_i16m1(a); -} - +{ return __riscv_vreinterpret_v_i32m1_i16m1(a); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); -} - +{ return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_i64m1_i16m1(a); -} - +{ return __riscv_vreinterpret_v_i64m1_i16m1(a); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); } NPY_FINLINE npyv_s16 npyv_reinterpret_s16_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); -} +{ return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); } #define npyv_reinterpret_u32_u32(X) X NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_u8m1_u32m1(a); -} - +{ return __riscv_vreinterpret_v_u8m1_u32m1(a); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); -} - +{ return __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_u16m1_u32m1(a); -} - +{ return __riscv_vreinterpret_v_u16m1_u32m1(a); } 
NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); -} - +{ return __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_i32m1_u32m1(a); -} - +{ return __riscv_vreinterpret_v_i32m1_u32m1(a); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_u64m1_u32m1(a); -} - +{ return __riscv_vreinterpret_v_u64m1_u32m1(a); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); -} - +{ return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_f32m1_u32m1(a); -} - +{ return __riscv_vreinterpret_v_f32m1_u32m1(a); } NPY_FINLINE npyv_u32 npyv_reinterpret_u32_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); -} +{ return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); } #define npyv_reinterpret_s32_s32(X) X NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_i8m1_i32m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); -} - +{ return __riscv_vreinterpret_v_i8m1_i32m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_i8m1_i32m1(a); -} - +{ return __riscv_vreinterpret_v_i8m1_i32m1(a); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); -} - +{ return __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_i16m1_i32m1(a); -} - +{ return __riscv_vreinterpret_v_i16m1_i32m1(a); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_u32m1_i32m1(a); -} - +{ return __riscv_vreinterpret_v_u32m1_i32m1(a); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); -} - +{ return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_i64m1_i32m1(a); -} - +{ return __riscv_vreinterpret_v_i64m1_i32m1(a); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_f32m1_i32m1(a); -} - +{ return __riscv_vreinterpret_v_f32m1_i32m1(a); } NPY_FINLINE npyv_s32 npyv_reinterpret_s32_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); -} +{ return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); } #define npyv_reinterpret_u64_u64(X) X NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_u8m1_u64m1(a); -} - +{ return __riscv_vreinterpret_v_u8m1_u64m1(a); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_u8m1_u64m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); -} - +{ return __riscv_vreinterpret_v_u8m1_u64m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_u16m1_u64m1(a); -} - +{ return 
__riscv_vreinterpret_v_u16m1_u64m1(a); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); -} - +{ return __riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_u32m1_u64m1(a); -} - +{ return __riscv_vreinterpret_v_u32m1_u64m1(a); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_i64m1_u64m1(a); -} - +{ return __riscv_vreinterpret_v_i64m1_u64m1(a); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); } NPY_FINLINE npyv_u64 npyv_reinterpret_u64_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_f64m1_u64m1(a); -} +{ return __riscv_vreinterpret_v_f64m1_u64m1(a); } #define npyv_reinterpret_s64_s64(X) X NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_i8m1_i64m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); -} - +{ return __riscv_vreinterpret_v_i8m1_i64m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_i8m1_i64m1(a); -} - +{ return __riscv_vreinterpret_v_i8m1_i64m1(a); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_i16m1_i64m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); -} - +{ return __riscv_vreinterpret_v_i16m1_i64m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_i16m1_i64m1(a); -} - +{ return __riscv_vreinterpret_v_i16m1_i64m1(a); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_i32m1_i64m1(a); -} - +{ return __riscv_vreinterpret_v_i32m1_i64m1(a); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_u64m1_i64m1(a); -} - +{ return __riscv_vreinterpret_v_u64m1_i64m1(a); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); } NPY_FINLINE npyv_s64 npyv_reinterpret_s64_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_f64m1_i64m1(a); -} +{ return __riscv_vreinterpret_v_f64m1_i64m1(a); } #define npyv_reinterpret_f32_f32(X) X NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(a)); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(a)); } 
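The reinterpret helpers above, and the remaining f32/f64 ones below, are pure bit casts: the register contents are reused unchanged and only relabelled, and the chained __riscv_vreinterpret calls simply reflect which type pairs the intrinsics expose a direct cast for. As a hedged scalar model only (the function name reinterpret_u32_f32_ref is an illustrative assumption, not part of the patch), the snippet below shows the per-lane behaviour of npyv_reinterpret_u32_f32.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Scalar model of npyv_reinterpret_u32_f32: keep the bit pattern of a
 * float lane and relabel it as uint32. memcpy is the strict-aliasing-safe
 * way to express the bit cast in portable C. */
static uint32_t reinterpret_u32_f32_ref(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);
    return bits;
}

int main(void)
{
    printf("0x%08x\n", (unsigned)reinterpret_u32_f32_ref(1.0f));   /* 0x3f800000 */
    printf("0x%08x\n", (unsigned)reinterpret_u32_f32_ref(-0.0f));  /* 0x80000000 */
    return 0;
}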
NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(a)); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i16m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i16m1_i32m1(a)); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_u32m1_f32m1(a); -} - +{ return __riscv_vreinterpret_v_u32m1_f32m1(a); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_i32m1_f32m1(a); -} - +{ return __riscv_vreinterpret_v_i32m1_f32m1(a); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(a)); -} - +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(a)); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1(a)); -} - +{ return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1(a)); } NPY_FINLINE npyv_f32 npyv_reinterpret_f32_f64(npyv_f64 a) -{ - return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a))); -} +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a))); } #define npyv_reinterpret_f64_f64(X) X NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u8(npyv_u8 a) -{ - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u8m1_u64m1(a)); -} - +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u8m1_u64m1(a)); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s8(npyv_s8 a) -{ - return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i8m1_i64m1(a)); -} - +{ return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i8m1_i64m1(a)); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u16(npyv_u16 a) -{ - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(a)); -} - +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(a)); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s16(npyv_s16 a) -{ - return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i16m1_i64m1(a)); -} - +{ return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i16m1_i64m1(a)); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u32(npyv_u32 a) -{ - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(a)); -} - +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(a)); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s32(npyv_s32 a) -{ - return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(a)); -} - +{ return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(a)); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u64(npyv_u64 a) -{ - return __riscv_vreinterpret_v_u64m1_f64m1(a); -} - +{ return __riscv_vreinterpret_v_u64m1_f64m1(a); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_i64m1_f64m1(a); -} - +{ return __riscv_vreinterpret_v_i64m1_f64m1(a); } NPY_FINLINE npyv_f64 npyv_reinterpret_f64_f32(npyv_f32 a) -{ - return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a))); -} +{ 
return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a))); } // Only required by AVX2/AVX512 #define npyv_cleanup() ((void)0) diff --git a/numpy/_core/src/common/simd/rvv/operators.h b/numpy/_core/src/common/simd/rvv/operators.h index a41e605093e0..03b3e516b5fa 100644 --- a/numpy/_core/src/common/simd/rvv/operators.h +++ b/numpy/_core/src/common/simd/rvv/operators.h @@ -10,244 +10,80 @@ ***************************/ // left NPY_FINLINE npyv_u16 npyv_shl_u16(npyv_u16 a, int16_t c) -{ - npyv_s16 b = npyv_setall_s16(c); - vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); - npyv_u16 shl = __riscv_vsll_vv_u16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); - vuint32m2_t a_ext = __riscv_vzext_vf2_u32m2(a, 8); - npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); - npyv_u16 shr = __riscv_vnclipu_wv_u16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); - return __riscv_vmerge_vvm_u16m1(shr, shl, positive_mask, 8); -} - +{ return __riscv_vsll_vx_u16m1(a, c, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_shl_s16(npyv_s16 a, int16_t c) -{ - npyv_s16 b = npyv_setall_s16(c); - vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); - npyv_s16 shl = __riscv_vsll_vv_i16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); - vint32m2_t a_ext = __riscv_vsext_vf2_i32m2(a, 8); - npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); - npyv_s16 shr = __riscv_vnclip_wv_i16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); - return __riscv_vmerge_vvm_i16m1(shr, shl, positive_mask, 8); -} - +{ return __riscv_vsll_vx_i16m1(a, c, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_shl_u32(npyv_u32 a, int32_t c) -{ - npyv_s32 b = npyv_setall_s32(c); - vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); - npyv_u32 shl = __riscv_vsll_vv_u32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); - vuint64m2_t a_ext = __riscv_vzext_vf2_u64m2(a, 4); - npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); - npyv_u32 shr = __riscv_vnclipu_wv_u32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); - return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 4); -} - +{ return __riscv_vsll_vx_u32m1(a, c, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_shl_s32(npyv_s32 a, int32_t c) -{ - npyv_s32 b = npyv_setall_s32(c); - vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); - npyv_s32 shl = __riscv_vsll_vv_i32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); - vint64m2_t a_ext = __riscv_vsext_vf2_i64m2(a, 4); - npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); - npyv_s32 shr = __riscv_vnclip_wv_i32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); - return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 4); -} - +{ return __riscv_vsll_vx_i32m1(a, c, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_shl_u64(npyv_u64 a, int64_t c) -{ - npyv_s64 b = npyv_setall_s64(c); - vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); - npyv_u64 shl = __riscv_vsll_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); - npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); - npyv_u64 shr = __riscv_vsrl_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); - return __riscv_vmerge_vvm_u64m1(shr, shl, positive_mask, 2); -} - +{ return __riscv_vsll_vx_u64m1(a, c, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_shl_s64(npyv_s64 a, int64_t c) -{ - npyv_s64 b = npyv_setall_s64(c); - vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); - npyv_s64 shl = __riscv_vsll_vv_i64m1(a, 
__riscv_vreinterpret_v_i64m1_u64m1(b), 2); - npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); - npyv_s64 shr = __riscv_vsra_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); - return __riscv_vmerge_vvm_i64m1(shr, shl, positive_mask, 2); -} +{ return __riscv_vsll_vx_i64m1(a, c, npyv_nlanes_s64); } // left by an immediate constant NPY_FINLINE npyv_u16 npyv_shli_u16(npyv_u16 a, const int b) -{ - return __riscv_vsll_vx_u16m1(a, b, 8); -} - +{ return __riscv_vsll_vx_u16m1(a, b, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_shli_s16(npyv_s16 a, const int b) -{ - return __riscv_vsll_vx_i16m1(a, b, 8); -} - +{ return __riscv_vsll_vx_i16m1(a, b, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_shli_u32(npyv_u32 a, const int b) -{ - return __riscv_vsll_vx_u32m1(a, b, 4); -} - +{ return __riscv_vsll_vx_u32m1(a, b, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_shli_s32(npyv_s32 a, const int b) -{ - return __riscv_vsll_vx_i32m1(a, b, 4); -} - +{ return __riscv_vsll_vx_i32m1(a, b, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_shli_u64(npyv_u64 a, const int b) -{ - return __riscv_vsll_vx_u64m1(a, b, 2); -} - +{ return __riscv_vsll_vx_u64m1(a, b, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_shli_s64(npyv_s64 a, const int b) -{ - return __riscv_vsll_vx_i64m1(a, b, 2); -} +{ return __riscv_vsll_vx_i64m1(a, b, npyv_nlanes_s64); } // right NPY_FINLINE npyv_u16 npyv_shr_u16(npyv_u16 a, int16_t c) -{ - npyv_s16 b = npyv_setall_s16(-(c)); - vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); - npyv_u16 shl = __riscv_vsll_vv_u16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); - vuint32m2_t a_ext = __riscv_vzext_vf2_u32m2(a, 8); - npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); - npyv_u16 shr = __riscv_vnclipu_wv_u16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); - return __riscv_vmerge_vvm_u16m1(shr, shl, positive_mask, 8); -} - +{ return __riscv_vsrl_vx_u16m1(a, c, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_shr_s16(npyv_s16 a, int16_t c) -{ - npyv_s16 b = npyv_setall_s16(-(c)); - vbool16_t positive_mask = __riscv_vmsgt_vx_i16m1_b16(b, 0, 8); - npyv_s16 shl = __riscv_vsll_vv_i16m1(a, __riscv_vreinterpret_v_i16m1_u16m1(b), 8); - vint32m2_t a_ext = __riscv_vsext_vf2_i32m2(a, 8); - npyv_s16 b_neg = __riscv_vneg_v_i16m1(b, 8); - npyv_s16 shr = __riscv_vnclip_wv_i16m1(a_ext, __riscv_vreinterpret_v_i16m1_u16m1(b_neg), __RISCV_VXRM_RDN, 8); - return __riscv_vmerge_vvm_i16m1(shr, shl, positive_mask, 8); -} - +{ return __riscv_vsra_vx_i16m1(a, c, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_shr_u32(npyv_u32 a, int32_t c) -{ - npyv_s32 b = npyv_setall_s32(-(c)); - vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); - npyv_u32 shl = __riscv_vsll_vv_u32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); - vuint64m2_t a_ext = __riscv_vzext_vf2_u64m2(a, 4); - npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); - npyv_u32 shr = __riscv_vnclipu_wv_u32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), __RISCV_VXRM_RDN, 4); - return __riscv_vmerge_vvm_u32m1(shr, shl, positive_mask, 4); -} - +{ return __riscv_vsrl_vx_u32m1(a, c, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_shr_s32(npyv_s32 a, int32_t c) -{ - npyv_s32 b = npyv_setall_s32(-(c)); - vbool32_t positive_mask = __riscv_vmsgt_vx_i32m1_b32(b, 0, 4); - npyv_s32 shl = __riscv_vsll_vv_i32m1(a, __riscv_vreinterpret_v_i32m1_u32m1(b), 4); - vint64m2_t a_ext = __riscv_vsext_vf2_i64m2(a, 4); - npyv_s32 b_neg = __riscv_vneg_v_i32m1(b, 4); - npyv_s32 shr = __riscv_vnclip_wv_i32m1(a_ext, __riscv_vreinterpret_v_i32m1_u32m1(b_neg), 
__RISCV_VXRM_RDN, 4); - return __riscv_vmerge_vvm_i32m1(shr, shl, positive_mask, 4); -} - +{ return __riscv_vsra_vx_i32m1(a, c, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_shr_u64(npyv_u64 a, int64_t c) -{ - npyv_s64 b = npyv_setall_s64(-(c)); - // implementation only works within defined range 'b' in [0, 63] - vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); - npyv_u64 shl = __riscv_vsll_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); - npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); - npyv_u64 shr = __riscv_vsrl_vv_u64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); - return __riscv_vmerge_vvm_u64m1(shr, shl, positive_mask, 2); -} - +{ return __riscv_vsrl_vx_u64m1(a, c, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_shr_s64(npyv_s64 a, int64_t c) -{ - npyv_s64 b = npyv_setall_s64(-(c)); - // implementation only works within defined range 'b' in [0, 63] - vbool64_t positive_mask = __riscv_vmsgt_vx_i64m1_b64(b, 0, 2); - npyv_s64 shl = __riscv_vsll_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b), 2); - npyv_s64 b_neg = __riscv_vneg_v_i64m1(b, 2); - npyv_s64 shr = __riscv_vsra_vv_i64m1(a, __riscv_vreinterpret_v_i64m1_u64m1(b_neg), 2); - return __riscv_vmerge_vvm_i64m1(shr, shl, positive_mask, 2); -} +{ return __riscv_vsra_vx_i64m1(a, c, npyv_nlanes_s64); } // right by an immediate constant -NPY_FINLINE npyv_u16 npyv_shri_u16(npyv_u16 a, const int b) { - const int b_half = b >> 1; - npyv_u16 srl1 = __riscv_vsrl_vx_u16m1(a, b_half, 8); - return __riscv_vsrl_vx_u16m1(srl1, b_half + (b & 0x1), 8); -} - -NPY_FINLINE npyv_s16 npyv_shri_s16(npyv_s16 a, const int b) { - const int imm = b - (b >> 4); - return __riscv_vsra_vx_i16m1(a, imm, 8); -} - -NPY_FINLINE npyv_u32 npyv_shri_u32(npyv_u32 a, const int b) { - const int b_half = b >> 1; - npyv_u32 srl1 = __riscv_vsrl_vx_u32m1(a, b_half, 4); - return __riscv_vsrl_vx_u32m1(srl1, b_half + (b & 0x1), 4); -} - -NPY_FINLINE npyv_s32 npyv_shri_s32(npyv_s32 a, const int b) { - const int imm = b - (b >> 5); - return __riscv_vsra_vx_i32m1(a, imm, 4); -} - -NPY_FINLINE npyv_u64 npyv_shri_u64(npyv_u64 a, const int b) { - const int b_half = b >> 1; - npyv_u64 srl1 = __riscv_vsrl_vx_u64m1(a, b_half, 2); - return __riscv_vsrl_vx_u64m1(srl1, b_half + (b & 0x1), 2); -} - -NPY_FINLINE npyv_s64 npyv_shri_s64(npyv_s64 a, const int b) { - const int imm = b - (b >> 6); - return __riscv_vsra_vx_i64m1(a, imm, 2); -} +NPY_FINLINE npyv_u16 npyv_shri_u16(npyv_u16 a, const int b) +{ return __riscv_vsrl_vx_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_shri_s16(npyv_s16 a, const int b) +{ return __riscv_vsra_vx_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_shri_u32(npyv_u32 a, const int b) +{ return __riscv_vsrl_vx_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_shri_s32(npyv_s32 a, const int b) +{ return __riscv_vsra_vx_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_shri_u64(npyv_u64 a, const int b) +{ return __riscv_vsrl_vx_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_shri_s64(npyv_s64 a, const int b) +{ return __riscv_vsra_vx_i64m1(a, b, npyv_nlanes_s64); } /*************************** * Logical ***************************/ // AND -NPY_FINLINE npyv_u8 npyv_and_u8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vand_vv_u8m1(a, b, 16); -} - +NPY_FINLINE npyv_u8 npyv_and_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vand_vv_u8m1(a, b, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_and_s8(npyv_s8 a, npyv_s8 b) -{ - return __riscv_vand_vv_i8m1(a, b, 16); -} - +{ return __riscv_vand_vv_i8m1(a, b, 
npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_and_u16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vand_vv_u16m1(a, b, 8); -} - +{ return __riscv_vand_vv_u16m1(a, b, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_and_s16(npyv_s16 a, npyv_s16 b) -{ - return __riscv_vand_vv_i16m1(a, b, 8); -} - +{ return __riscv_vand_vv_i16m1(a, b, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_and_u32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vand_vv_u32m1(a, b, 4); -} - +{ return __riscv_vand_vv_u32m1(a, b, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_and_s32(npyv_s32 a, npyv_s32 b) -{ - return __riscv_vand_vv_i32m1(a, b, 4); -} - +{ return __riscv_vand_vv_i32m1(a, b, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_and_u64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vand_vv_u64m1(a, b, 2); -} - +{ return __riscv_vand_vv_u64m1(a, b, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_and_s64(npyv_s64 a, npyv_s64 b) -{ - return __riscv_vand_vv_i64m1(a, b, 2); -} +{ return __riscv_vand_vv_i64m1(a, b, npyv_nlanes_s64); } NPY_FINLINE npyv_f32 npyv_and_f32(npyv_f32 a, npyv_f32 b) { @@ -255,82 +91,43 @@ NPY_FINLINE npyv_f32 npyv_and_f32(npyv_f32 a, npyv_f32 b) __riscv_vand_vv_u32m1( __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), - 4 + npyv_nlanes_f32 ) ); } - NPY_FINLINE npyv_f64 npyv_and_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vreinterpret_v_u64m1_f64m1( __riscv_vand_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), - 2 + npyv_nlanes_f64 ) ); } -NPY_FINLINE npyv_u8 npyv_and_b8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vand_vv_u8m1(a, b, 16); -} - -NPY_FINLINE npyv_u16 npyv_and_b16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vand_vv_u16m1(a, b, 8); -} - -NPY_FINLINE npyv_u32 npyv_and_b32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vand_vv_u32m1(a, b, 4); -} - -NPY_FINLINE npyv_u64 npyv_and_b64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vand_vv_u64m1(a, b, 2); -} +#define npyv_and_b8 npyv_and_u8 +#define npyv_and_b16 npyv_and_u16 +#define npyv_and_b32 npyv_and_u32 +#define npyv_and_b64 npyv_and_u64 // OR NPY_FINLINE npyv_u8 npyv_or_u8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vor_vv_u8m1(a, b, 16); -} - +{ return __riscv_vor_vv_u8m1(a, b, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_or_s8(npyv_s8 a, npyv_s8 b) -{ - return __riscv_vor_vv_i8m1(a, b, 16); -} - +{ return __riscv_vor_vv_i8m1(a, b, npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_or_u16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vor_vv_u16m1(a, b, 8); -} - +{ return __riscv_vor_vv_u16m1(a, b, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_or_s16(npyv_s16 a, npyv_s16 b) -{ - return __riscv_vor_vv_i16m1(a, b, 8); -} - +{ return __riscv_vor_vv_i16m1(a, b, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_or_u32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vor_vv_u32m1(a, b, 4); -} - +{ return __riscv_vor_vv_u32m1(a, b, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_or_s32(npyv_s32 a, npyv_s32 b) -{ - return __riscv_vor_vv_i32m1(a, b, 4); -} - +{ return __riscv_vor_vv_i32m1(a, b, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_or_u64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vor_vv_u64m1(a, b, 2); -} - +{ return __riscv_vor_vv_u64m1(a, b, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_or_s64(npyv_s64 a, npyv_s64 b) -{ - return __riscv_vor_vv_i64m1(a, b, 2); -} +{ return __riscv_vor_vv_i64m1(a, b, npyv_nlanes_s64); } NPY_FINLINE npyv_f32 npyv_or_f32(npyv_f32 a, npyv_f32 b) { @@ -338,82 +135,43 @@ NPY_FINLINE npyv_f32 npyv_or_f32(npyv_f32 a, npyv_f32 b) __riscv_vor_vv_u32m1( 
__riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), - 4 + npyv_nlanes_f32 ) ); } - NPY_FINLINE npyv_f64 npyv_or_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vreinterpret_v_u64m1_f64m1( __riscv_vor_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), - 2 + npyv_nlanes_f64 ) ); } -NPY_FINLINE npyv_u8 npyv_or_b8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vor_vv_u8m1(a, b, 16); -} - -NPY_FINLINE npyv_u16 npyv_or_b16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vor_vv_u16m1(a, b, 8); -} - -NPY_FINLINE npyv_u32 npyv_or_b32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vor_vv_u32m1(a, b, 4); -} - -NPY_FINLINE npyv_u64 npyv_or_b64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vor_vv_u64m1(a, b, 2); -} +#define npyv_or_b8 npyv_or_u8 +#define npyv_or_b16 npyv_or_u16 +#define npyv_or_b32 npyv_or_u32 +#define npyv_or_b64 npyv_or_u64 // XOR NPY_FINLINE npyv_u8 npyv_xor_u8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vxor_vv_u8m1(a, b, 16); -} - +{ return __riscv_vxor_vv_u8m1(a, b, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_xor_s8(npyv_s8 a, npyv_s8 b) -{ - return __riscv_vxor_vv_i8m1(a, b, 16); -} - +{ return __riscv_vxor_vv_i8m1(a, b, npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_xor_u16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vxor_vv_u16m1(a, b, 8); -} - +{ return __riscv_vxor_vv_u16m1(a, b, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_xor_s16(npyv_s16 a, npyv_s16 b) -{ - return __riscv_vxor_vv_i16m1(a, b, 8); -} - +{ return __riscv_vxor_vv_i16m1(a, b, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_xor_u32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vxor_vv_u32m1(a, b, 4); -} - +{ return __riscv_vxor_vv_u32m1(a, b, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_xor_s32(npyv_s32 a, npyv_s32 b) -{ - return __riscv_vxor_vv_i32m1(a, b, 4); -} - +{ return __riscv_vxor_vv_i32m1(a, b, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_xor_u64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vxor_vv_u64m1(a, b, 2); -} - +{ return __riscv_vxor_vv_u64m1(a, b, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_xor_s64(npyv_s64 a, npyv_s64 b) -{ - return __riscv_vxor_vv_i64m1(a, b, 2); -} +{ return __riscv_vxor_vv_i64m1(a, b, npyv_nlanes_s64); } NPY_FINLINE npyv_f32 npyv_xor_f32(npyv_f32 a, npyv_f32 b) { @@ -421,339 +179,168 @@ NPY_FINLINE npyv_f32 npyv_xor_f32(npyv_f32 a, npyv_f32 b) __riscv_vxor_vv_u32m1( __riscv_vreinterpret_v_f32m1_u32m1(a), __riscv_vreinterpret_v_f32m1_u32m1(b), - 4 + npyv_nlanes_f32 ) ); } - NPY_FINLINE npyv_f64 npyv_xor_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vreinterpret_v_u64m1_f64m1( __riscv_vxor_vv_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), __riscv_vreinterpret_v_f64m1_u64m1(b), - 2 + npyv_nlanes_f64 ) ); } -NPY_FINLINE npyv_u8 npyv_xor_b8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vxor_vv_u8m1(a, b, 16); -} - -NPY_FINLINE npyv_u16 npyv_xor_b16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vxor_vv_u16m1(a, b, 8); -} - -NPY_FINLINE npyv_u32 npyv_xor_b32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vxor_vv_u32m1(a, b, 4); -} - -NPY_FINLINE npyv_u64 npyv_xor_b64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vxor_vv_u64m1(a, b, 2); -} +#define npyv_xor_b8 npyv_xor_u8 +#define npyv_xor_b16 npyv_xor_u16 +#define npyv_xor_b32 npyv_xor_u32 +#define npyv_xor_b64 npyv_xor_u64 // NOT NPY_FINLINE npyv_u8 npyv_not_u8(npyv_u8 a) -{ - return __riscv_vnot_v_u8m1(a, 16); -} - +{ return __riscv_vnot_v_u8m1(a, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_not_s8(npyv_s8 a) -{ - return __riscv_vnot_v_i8m1(a, 16); -} - +{ return __riscv_vnot_v_i8m1(a, 
npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_not_u16(npyv_u16 a) -{ - return __riscv_vnot_v_u16m1(a, 8); -} - +{ return __riscv_vnot_v_u16m1(a, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_not_s16(npyv_s16 a) -{ - return __riscv_vnot_v_i16m1(a, 8); -} - +{ return __riscv_vnot_v_i16m1(a, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_not_u32(npyv_u32 a) -{ - return __riscv_vnot_v_u32m1(a, 4); -} - +{ return __riscv_vnot_v_u32m1(a, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_not_s32(npyv_s32 a) -{ - return __riscv_vnot_v_i32m1(a, 4); -} - +{ return __riscv_vnot_v_i32m1(a, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a) -{ - return __riscv_vnot_v_u64m1(a, 2); -} - +{ return __riscv_vnot_v_u64m1(a, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_not_s64(npyv_s64 a) -{ - return __riscv_vreinterpret_v_u64m1_i64m1( - __riscv_vnot_v_u64m1( - __riscv_vreinterpret_v_i64m1_u64m1(a), - 2 - ) - ); -} +{ return __riscv_vnot_v_i64m1(a, npyv_nlanes_s64); } NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) { return __riscv_vreinterpret_v_u32m1_f32m1( __riscv_vnot_v_u32m1( __riscv_vreinterpret_v_f32m1_u32m1(a), - 4 + npyv_nlanes_f32 ) ); } - NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) { return __riscv_vreinterpret_v_u64m1_f64m1( __riscv_vnot_v_u64m1( __riscv_vreinterpret_v_f64m1_u64m1(a), - 2 + npyv_nlanes_f64 ) ); } -NPY_FINLINE npyv_u8 npyv_not_b8(npyv_u8 a) -{ - return __riscv_vnot_v_u8m1(a, 16); -} - -NPY_FINLINE npyv_u16 npyv_not_b16(npyv_u16 a) -{ - return __riscv_vnot_v_u16m1(a, 8); -} - -NPY_FINLINE npyv_u32 npyv_not_b32(npyv_u32 a) -{ - return __riscv_vnot_v_u32m1(a, 4); -} - -#define npyv_not_b64 npyv_not_u64 +#define npyv_not_b8 npyv_not_u8 +#define npyv_not_b16 npyv_not_u16 +#define npyv_not_b32 npyv_not_u32 +#define npyv_not_b64 npyv_not_u64 // ANDC, ORC and XNOR -NPY_FINLINE npyv_u8 npyv_andc_u8(npyv_u8 a, npyv_u8 b) { - return __riscv_vand_vv_u8m1(a, __riscv_vnot_v_u8m1(b, 16), 16); -} - -NPY_FINLINE npyv_u8 npyv_andc_b8(npyv_u8 a, npyv_u8 b) { - return __riscv_vand_vv_u8m1(a, __riscv_vnot_v_u8m1(b, 16), 16); -} - -NPY_FINLINE npyv_u8 npyv_orc_b8(npyv_u8 a, npyv_u8 b) { - return __riscv_vor_vv_u8m1(a, __riscv_vnot_v_u8m1(b, 16), 16); -} +NPY_FINLINE npyv_u8 npyv_andc_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vand_vv_u8m1(a, __riscv_vnot_v_u8m1(b, npyv_nlanes_u8), npyv_nlanes_u8); } -NPY_FINLINE npyv_u8 npyv_xnor_b8(npyv_u8 a, npyv_u8 b) { - vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} +#define npyv_andc_b8 npyv_andc_u8 +NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) +{ return __riscv_vor_vv_u8m1(a, __riscv_vnot_v_u8m1(b, npyv_nlanes_u8), npyv_nlanes_u8); } +NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) +{ return __riscv_vnot_v_u8m1(__riscv_vxor_vv_u8m1(a, b, npyv_nlanes_u8), npyv_nlanes_u8); } /*************************** * Comparison ***************************/ // equal -NPY_FINLINE npyv_u8 npyv_cmpeq_u8(npyv_u8 a, npyv_u8 b) { - vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} - -NPY_FINLINE npyv_u16 npyv_cmpeq_u16(npyv_u16 a, npyv_u16 b) { - vbool16_t cmp_res = __riscv_vmseq_vv_u16m1_b16(a, b, 8); - return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); -} - -NPY_FINLINE npyv_u32 npyv_cmpeq_u32(npyv_u32 a, npyv_u32 b) { - vbool32_t cmp_res = 
__riscv_vmseq_vv_u32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u8 npyv_cmpeq_s8(npyv_s8 a, npyv_s8 b) { - vbool8_t cmp_res = __riscv_vmseq_vv_i8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} - -NPY_FINLINE npyv_u16 npyv_cmpeq_s16(npyv_s16 a, npyv_s16 b) { - vbool16_t cmp_res = __riscv_vmseq_vv_i16m1_b16(a, b, 8); - return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); -} - -NPY_FINLINE npyv_u32 npyv_cmpeq_s32(npyv_s32 a, npyv_s32 b) { - vbool32_t cmp_res = __riscv_vmseq_vv_i32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u32 npyv_cmpeq_f32(vfloat32m1_t a, vfloat32m1_t b) { - vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u64 npyv_cmpeq_f64(vfloat64m1_t a, vfloat64m1_t b) { - vbool64_t cmp_res = __riscv_vmfeq_vv_f64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} - -NPY_FINLINE npyv_u64 npyv_cmpeq_u64(npyv_u64 a, npyv_u64 b) { - vbool64_t cmp_res = __riscv_vmseq_vv_u64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} - -NPY_FINLINE npyv_u64 npyv_cmpeq_s64(npyv_s64 a, npyv_s64 b) { - vbool64_t cmp_res = __riscv_vmseq_vv_i64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} +NPY_FINLINE npyv_b8 npyv_cmpeq_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmseq_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpeq_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmseq_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpeq_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmseq_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpeq_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmseq_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpeq_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmseq_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpeq_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmseq_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpeq_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmseq_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpeq_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmseq_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpeq_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfeq_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpeq_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfeq_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } // not Equal -NPY_FINLINE npyv_u8 npyv_cmpneq_u8(npyv_u8 a, npyv_u8 b) { - vbool8_t cmp_res = __riscv_vmseq_vv_u8m1_b8(a, b, 16); - return __riscv_vnot_v_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16), 16); -} - -NPY_FINLINE npyv_u8 npyv_cmpneq_s8(npyv_s8 a, 
npyv_s8 b) { - vbool8_t cmp_res = __riscv_vmseq_vv_i8m1_b8(a, b, 16); - return __riscv_vnot_v_u8m1(__riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16), 16); -} - -NPY_FINLINE npyv_u16 npyv_cmpneq_u16(npyv_u16 a, npyv_u16 b) { - vbool16_t cmp_res = __riscv_vmseq_vv_u16m1_b16(a, b, 8); - return __riscv_vnot_v_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8), 8); -} - -NPY_FINLINE npyv_u16 npyv_cmpneq_s16(npyv_s16 a, npyv_s16 b) { - vbool16_t cmp_res = __riscv_vmseq_vv_i16m1_b16(a, b, 8); - return __riscv_vnot_v_u16m1(__riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8), 8); -} - -NPY_FINLINE npyv_u32 npyv_cmpneq_u32(npyv_u32 a, npyv_u32 b) { - vbool32_t cmp_res = __riscv_vmseq_vv_u32m1_b32(a, b, 4); - return __riscv_vnot_v_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4), 4); -} - -NPY_FINLINE npyv_u32 npyv_cmpneq_s32(npyv_s32 a, npyv_s32 b) { - vbool32_t cmp_res = __riscv_vmseq_vv_i32m1_b32(a, b, 4); - return __riscv_vnot_v_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4), 4); -} - -#define npyv_cmpneq_u64(A, B) npyv_not_u64(npyv_cmpeq_u64(A, B)) -#define npyv_cmpneq_s64(A, B) npyv_not_u64(npyv_cmpeq_s64(A, B)) - -NPY_FINLINE npyv_u32 npyv_cmpneq_f32(vfloat32m1_t a, vfloat32m1_t b) { - vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(a, b, 4); - return __riscv_vnot_v_u32m1(__riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4), 4); -} - -#define npyv_cmpneq_f64(A, B) npyv_not_u64(__riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), __riscv_vmfeq_vv_f64m1_b64(A, B, 2), 2)) +NPY_FINLINE npyv_b8 npyv_cmpneq_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmsne_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpneq_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmsne_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpneq_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmsne_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpneq_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmsne_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpneq_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmsne_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpneq_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmsne_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpneq_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmsne_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpneq_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmsne_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpneq_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfne_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpneq_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfne_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } // greater than -NPY_FINLINE npyv_u8 npyv_cmpgt_u8(npyv_u8 a, npyv_u8 b) { - vbool8_t cmp_res = __riscv_vmsgtu_vv_u8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} - -NPY_FINLINE npyv_u8 npyv_cmpgt_s8(npyv_s8 a, 
npyv_s8 b) { - vbool8_t cmp_res = __riscv_vmsgt_vv_i8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} - -NPY_FINLINE npyv_u16 npyv_cmpgt_u16(npyv_u16 a, npyv_u16 b) { - vbool16_t cmp_res = __riscv_vmsgtu_vv_u16m1_b16(a, b, 8); - return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); -} - -NPY_FINLINE npyv_u16 npyv_cmpgt_s16(npyv_s16 a, npyv_s16 b) { - vbool16_t cmp_res = __riscv_vmsgt_vv_i16m1_b16(a, b, 8); - return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); -} - -NPY_FINLINE npyv_u32 npyv_cmpgt_u32(npyv_u32 a, npyv_u32 b) { - vbool32_t cmp_res = __riscv_vmsgtu_vv_u32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u32 npyv_cmpgt_s32(npyv_s32 a, npyv_s32 b) { - vbool32_t cmp_res = __riscv_vmsgt_vv_i32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u32 npyv_cmpgt_f32(vfloat32m1_t a, vfloat32m1_t b) { - vbool32_t cmp_res = __riscv_vmfgt_vv_f32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u64 npyv_cmpgt_f64(vfloat64m1_t a, vfloat64m1_t b) { - vbool64_t cmp_res = __riscv_vmfgt_vv_f64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} - -NPY_FINLINE npyv_u64 npyv_cmpgt_u64(npyv_u64 a, npyv_u64 b) { - vbool64_t cmp_res = __riscv_vmsgtu_vv_u64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} - -NPY_FINLINE npyv_u64 npyv_cmpgt_s64(npyv_s64 a, npyv_s64 b) { - vbool64_t cmp_res = __riscv_vmsgt_vv_i64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} +NPY_FINLINE npyv_b8 npyv_cmpgt_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmsgtu_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpgt_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmsgt_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpgt_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmsgtu_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpgt_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmsgt_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpgt_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmsgtu_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpgt_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmsgt_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpgt_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmsgtu_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpgt_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmsgt_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpgt_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfgt_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpgt_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfgt_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } // greater than or equal 
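/*
 * A sketch, not part of this patch: the comparisons above now return
 * npyv_b8/b16/b32/b64 through the npyv__to_b* helpers, which are defined in
 * another hunk of this series.  Judging from the vmerge-based code being
 * removed here (and from the fact that this port represents boolean vectors
 * as plain unsigned vectors), such a helper presumably widens a vbool mask
 * into a vector with all bits set in the true lanes.  The function name
 * below is illustrative only; this is a standalone model, not the patch's
 * actual definition.
 */
#include <riscv_vector.h>
#include <stdint.h>

static inline vuint8m1_t rvv_mask_to_b8_model(vbool8_t mask, size_t vl)
{
    /* false lanes keep 0x00, true lanes take 0xFF */
    return __riscv_vmerge_vvm_u8m1(
        __riscv_vmv_v_x_u8m1(0x00, vl),       /* value for false lanes */
        __riscv_vmv_v_x_u8m1(UINT8_MAX, vl),  /* value for true lanes  */
        mask, vl);
}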
-NPY_FINLINE npyv_u8 npyv_cmpge_u8(npyv_u8 a, npyv_u8 b) { - vbool8_t cmp_res = __riscv_vmsgeu_vv_u8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} - -NPY_FINLINE npyv_u8 npyv_cmpge_s8(npyv_s8 a, npyv_s8 b) { - vbool8_t cmp_res = __riscv_vmsge_vv_i8m1_b8(a, b, 16); - return __riscv_vmerge_vvm_u8m1(__riscv_vmv_v_x_u8m1(0x0, 16), __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), cmp_res, 16); -} - -NPY_FINLINE npyv_u16 npyv_cmpge_u16(npyv_u16 a, npyv_u16 b) { - vbool16_t cmp_res = __riscv_vmsgeu_vv_u16m1_b16(a, b, 8); - return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); -} - -NPY_FINLINE npyv_u16 npyv_cmpge_s16(npyv_s16 a, npyv_s16 b) { - vbool16_t cmp_res = __riscv_vmsge_vv_i16m1_b16(a, b, 8); - return __riscv_vmerge_vvm_u16m1(__riscv_vmv_v_x_u16m1(0x0, 8), __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), cmp_res, 8); -} - -NPY_FINLINE npyv_u32 npyv_cmpge_u32(npyv_u32 a, npyv_u32 b) { - vbool32_t cmp_res = __riscv_vmsgeu_vv_u32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u32 npyv_cmpge_s32(npyv_s32 a, npyv_s32 b) { - vbool32_t cmp_res = __riscv_vmsge_vv_i32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u32 npyv_cmpge_f32(vfloat32m1_t a, vfloat32m1_t b) { - vbool32_t cmp_res = __riscv_vmfge_vv_f32m1_b32(a, b, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - -NPY_FINLINE npyv_u64 npyv_cmpge_f64(vfloat64m1_t a, vfloat64m1_t b) { - vbool64_t cmp_res = __riscv_vmfge_vv_f64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} - -NPY_FINLINE npyv_u64 npyv_cmpge_u64(npyv_u64 a, npyv_u64 b) { - vbool64_t cmp_res = __riscv_vmsgeu_vv_u64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} - -NPY_FINLINE npyv_u64 npyv_cmpge_s64(npyv_s64 a, npyv_s64 b) { - vbool64_t cmp_res = __riscv_vmsge_vv_i64m1_b64(a, b, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} +NPY_FINLINE npyv_b8 npyv_cmpge_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmsgeu_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpge_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmsge_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpge_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmsgeu_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpge_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmsge_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpge_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmsgeu_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpge_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmsge_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpge_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmsgeu_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpge_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmsge_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 
npyv_cmpge_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfge_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpge_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfge_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } // less than #define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) @@ -781,164 +368,63 @@ NPY_FINLINE npyv_u64 npyv_cmpge_s64(npyv_s64 a, npyv_s64 b) { // check special cases NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) -{ - vbool32_t cmp_res = __riscv_vmfeq_vv_f32m1_b32(a, a, 4); - return __riscv_vmerge_vvm_u32m1(__riscv_vmv_v_x_u32m1(0x0, 4), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), cmp_res, 4); -} - +{ return npyv__to_b32(__riscv_vmfeq_vv_f32m1_b32(a, a, npyv_nlanes_f32)); } NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) -{ - vbool64_t cmp_res = __riscv_vmfeq_vv_f64m1_b64(a, a, 2); - return __riscv_vmerge_vvm_u64m1(__riscv_vmv_v_x_u64m1(0x0, 2), __riscv_vmv_v_x_u64m1(UINT64_MAX, 2), cmp_res, 2); -} +{ return npyv__to_b64(__riscv_vmfeq_vv_f64m1_b64(a, a, npyv_nlanes_f64)); } // Test cross all vector lanes // any: returns true if any of the elements is not equal to zero // all: returns true if all elements are not equal to zero -NPY_FINLINE bool npyv_any_b8(npyv_u8 a) -{ - return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(0, 16), 16)) != 0; -} -NPY_FINLINE bool npyv_all_b8(npyv_u8 a) -{ - return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1(a, __riscv_vmv_v_x_u8m1(UINT8_MAX, 16), 16)) != 0; -} - -NPY_FINLINE bool npyv_any_b16(npyv_b16 a) -{ - return __riscv_vmv_x_s_u16m1_u16(__riscv_vredmaxu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(0, 8), 8)) != 0; -} - -NPY_FINLINE bool npyv_all_b16(npyv_b16 a) -{ - return __riscv_vmv_x_s_u16m1_u16(__riscv_vredminu_vs_u16m1_u16m1(a, __riscv_vmv_v_x_u16m1(UINT16_MAX, 8), 8)) != 0; -} - -NPY_FINLINE bool npyv_any_b32(npyv_b32 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(0, 4), 4)) != 0; -} - -NPY_FINLINE bool npyv_all_b32(npyv_b32 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a, __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)) != 0; -} - NPY_FINLINE bool npyv_any_u8(npyv_u8 a) -{ - return npyv_any_b8(npyv_reinterpret_u8_u8(a)); -} - +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u8), npyv_nlanes_u8) != -1; } NPY_FINLINE bool npyv_all_u8(npyv_u8 a) -{ - return npyv_all_b8(npyv_reinterpret_u8_u8(a)); -} - -NPY_FINLINE bool npyv_any_s8(npyv_s8 a) -{ - return npyv_any_b8(npyv_reinterpret_u8_s8(a)); -} - -NPY_FINLINE bool npyv_all_s8(npyv_s8 a) -{ - return npyv_all_b8(npyv_reinterpret_u8_s8(a)); -} - +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u8), npyv_nlanes_u8) == -1; } NPY_FINLINE bool npyv_any_u16(npyv_u16 a) -{ - return npyv_any_b16(npyv_reinterpret_u16_u16(a)); -} - +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u16), npyv_nlanes_u16) != -1; } NPY_FINLINE bool npyv_all_u16(npyv_u16 a) -{ - return npyv_all_b16(npyv_reinterpret_u16_u16(a)); -} - -NPY_FINLINE bool npyv_any_s16(npyv_s16 a) -{ - return npyv_any_b16(npyv_reinterpret_u16_s16(a)); -} - -NPY_FINLINE bool npyv_all_s16(npyv_s16 a) -{ - return npyv_all_b16(npyv_reinterpret_u16_s16(a)); -} - +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u16), npyv_nlanes_u16) == -1; } NPY_FINLINE bool npyv_any_u32(npyv_u32 a) -{ - return npyv_any_b32(npyv_reinterpret_u32_u32(a)); -} - +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u32), npyv_nlanes_u32) != -1; } NPY_FINLINE bool npyv_all_u32(npyv_u32 a) 
-{ - return npyv_all_b32(npyv_reinterpret_u32_u32(a)); -} +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u32), npyv_nlanes_u32) == -1; } +NPY_FINLINE bool npyv_any_u64(npyv_u64 a) +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u64), npyv_nlanes_u64) != -1; } +NPY_FINLINE bool npyv_all_u64(npyv_u64 a) +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u64), npyv_nlanes_u64) == -1; } -NPY_FINLINE bool npyv_any_s32(npyv_s32 a) -{ - return npyv_any_b32(npyv_reinterpret_u32_s32(a)); -} +#define npyv_any_b8 npyv_any_u8 +#define npyv_all_b8 npyv_all_u8 +#define npyv_any_b16 npyv_any_u16 +#define npyv_all_b16 npyv_all_u16 +#define npyv_any_b32 npyv_any_u32 +#define npyv_all_b32 npyv_all_u32 +#define npyv_any_b64 npyv_any_u64 +#define npyv_all_b64 npyv_all_u64 +NPY_FINLINE bool npyv_any_s8(npyv_s8 a) +{ return npyv_any_u8(npyv_reinterpret_u8_s8(a)); } +NPY_FINLINE bool npyv_all_s8(npyv_s8 a) +{ return npyv_all_u8(npyv_reinterpret_u8_s8(a)); } +NPY_FINLINE bool npyv_any_s16(npyv_s16 a) +{ return npyv_any_u16(npyv_reinterpret_u16_s16(a)); } +NPY_FINLINE bool npyv_all_s16(npyv_s16 a) +{ return npyv_all_u16(npyv_reinterpret_u16_s16(a)); } +NPY_FINLINE bool npyv_any_s32(npyv_s32 a) +{ return npyv_any_u32(npyv_reinterpret_u32_s32(a)); } NPY_FINLINE bool npyv_all_s32(npyv_s32 a) -{ - return npyv_all_b32(npyv_reinterpret_u32_s32(a)); -} - -NPY_FINLINE bool npyv_any_b64(npyv_b64 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(a), __riscv_vmv_v_x_u32m1(0, 4), 4)) != 0; -} - -NPY_FINLINE bool npyv_all_b64(npyv_b64 a) -{ - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(__riscv_vreinterpret_v_u64m1_u32m1(a), __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)) != 0; -} - -#define npyv_any_u64 npyv_any_b64 - -NPY_FINLINE npyv_u32 vrev64q_u32(npyv_u32 a) { - npyv_u32 vid = __riscv_vid_v_u32m1(2); - npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); - npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); - npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); - return __riscv_vrgather_vv_u32m1(a, idxs, 4); -} - -NPY_FINLINE bool npyv_all_u64(npyv_u64 a) -{ - npyv_u32 a32 = __riscv_vreinterpret_v_u64m1_u32m1(a); - a32 = __riscv_vor_vv_u32m1(a32, vrev64q_u32(a32), 4); - return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a32, __riscv_vmv_v_x_u32m1(UINT32_MAX, 4), 4)) != 0; -} - +{ return npyv_all_u32(npyv_reinterpret_u32_s32(a)); } NPY_FINLINE bool npyv_any_s64(npyv_s64 a) -{ - return npyv_any_u64(__riscv_vreinterpret_v_i64m1_u64m1(a)); -} - +{ return npyv_any_u64(npyv_reinterpret_u64_s64(a)); } NPY_FINLINE bool npyv_all_s64(npyv_s64 a) -{ - return npyv_all_u64(__riscv_vreinterpret_v_i64m1_u64m1(a)); -} +{ return npyv_all_u64(npyv_reinterpret_u64_s64(a)); } NPY_FINLINE bool npyv_any_f32(npyv_f32 a) -{ - return !npyv_all_b32(npyv_cmpeq_f32(a, npyv_zero_f32())); -} - +{ return npyv_any_u32(npyv_reinterpret_u32_f32(__riscv_vfabs(a, npyv_nlanes_f32))); } NPY_FINLINE bool npyv_all_f32(npyv_f32 a) -{ - return !npyv_any_b32(npyv_cmpeq_f32(a, npyv_zero_f32())); -} - +{ return npyv_all_u32(npyv_reinterpret_u32_f32(__riscv_vfabs(a, npyv_nlanes_f32))); } NPY_FINLINE bool npyv_any_f64(npyv_f64 a) -{ - return !npyv_all_b64(npyv_cmpeq_f64(a, npyv_zero_f64())); -} - +{ return npyv_any_u64(npyv_reinterpret_u64_f64(__riscv_vfabs(a, npyv_nlanes_f64))); } NPY_FINLINE bool npyv_all_f64(npyv_f64 a) -{ - return !npyv_any_b64(npyv_cmpeq_f64(a, 
npyv_zero_f64())); -} +{ return npyv_all_u64(npyv_reinterpret_u64_f64(__riscv_vfabs(a, npyv_nlanes_f64))); } #endif // _NPY_SIMD_RVV_OPERATORS_H diff --git a/numpy/_core/src/common/simd/rvv/reorder.h b/numpy/_core/src/common/simd/rvv/reorder.h index 25ebf62851a5..28a146c3deb0 100644 --- a/numpy/_core/src/common/simd/rvv/reorder.h +++ b/numpy/_core/src/common/simd/rvv/reorder.h @@ -7,164 +7,135 @@ // combine lower part of two vectors NPY_FINLINE npyv_u8 npyv_combinel_u8(npyv_u8 a, npyv_u8 b) -{ - return __riscv_vslideup_vx_u8m1(a, b, 8, 16); -} - +{ return __riscv_vslideup_vx_u8m1(a, b, npyv_nlanes_u8 / 2, npyv_nlanes_u8); } NPY_FINLINE npyv_s8 npyv_combinel_s8(npyv_s8 a, npyv_s8 b) -{ - return __riscv_vslideup_vx_i8m1(a, b, 8, 16); -} - +{ return __riscv_vslideup_vx_i8m1(a, b, npyv_nlanes_s8 / 2, npyv_nlanes_s8); } NPY_FINLINE npyv_u16 npyv_combinel_u16(npyv_u16 a, npyv_u16 b) -{ - return __riscv_vslideup_vx_u16m1(a, b, 4, 8); -} - +{ return __riscv_vslideup_vx_u16m1(a, b, npyv_nlanes_u16 / 2, npyv_nlanes_u16); } NPY_FINLINE npyv_s16 npyv_combinel_s16(npyv_s16 a, npyv_s16 b) -{ - return __riscv_vslideup_vx_i16m1(a, b, 4, 8); -} - +{ return __riscv_vslideup_vx_i16m1(a, b, npyv_nlanes_s16 / 2, npyv_nlanes_s16); } NPY_FINLINE npyv_u32 npyv_combinel_u32(npyv_u32 a, npyv_u32 b) -{ - return __riscv_vslideup_vx_u32m1(a, b, 2, 4); -} - +{ return __riscv_vslideup_vx_u32m1(a, b, npyv_nlanes_u32 / 2, npyv_nlanes_u32); } NPY_FINLINE npyv_s32 npyv_combinel_s32(npyv_s32 a, npyv_s32 b) -{ - return __riscv_vslideup_vx_i32m1(a, b, 2, 4); -} - +{ return __riscv_vslideup_vx_i32m1(a, b, npyv_nlanes_s32 / 2, npyv_nlanes_s32); } NPY_FINLINE npyv_u64 npyv_combinel_u64(npyv_u64 a, npyv_u64 b) -{ - return __riscv_vslideup_vx_u64m1(a, b, 1, 2); -} - +{ return __riscv_vslideup_vx_u64m1(a, b, npyv_nlanes_u64 / 2, npyv_nlanes_u64); } NPY_FINLINE npyv_s64 npyv_combinel_s64(npyv_s64 a, npyv_s64 b) -{ - return __riscv_vslideup_vx_i64m1(a, b, 1, 2); -} - +{ return __riscv_vslideup_vx_i64m1(a, b, npyv_nlanes_s64 / 2, npyv_nlanes_s64); } NPY_FINLINE npyv_f32 npyv_combinel_f32(npyv_f32 a, npyv_f32 b) -{ - return __riscv_vslideup_vx_f32m1(a, b, 2, 4); -} - +{ return __riscv_vslideup_vx_f32m1(a, b, npyv_nlanes_f32 / 2, npyv_nlanes_f32); } NPY_FINLINE npyv_f64 npyv_combinel_f64(npyv_f64 a, npyv_f64 b) -{ - return __riscv_vslideup_vx_f64m1(a, b, 1, 2); -} +{ return __riscv_vslideup_vx_f64m1(a, b, npyv_nlanes_f64 / 2, npyv_nlanes_f64); } // combine higher part of two vectors NPY_FINLINE npyv_u8 npyv_combineh_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vslideup_vx_u8m1( - __riscv_vslidedown_vx_u8m1(a, 8, 16), - __riscv_vslidedown_vx_u8m1(b, 8, 16), - 8, - 16 + __riscv_vslidedown_vx_u8m1(a, npyv_nlanes_u8 / 2, npyv_nlanes_u8), + __riscv_vslidedown_vx_u8m1(b, npyv_nlanes_u8 / 2, npyv_nlanes_u8), + npyv_nlanes_u8 / 2, + npyv_nlanes_u8 ); } NPY_FINLINE npyv_u16 npyv_combineh_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vslideup_vx_u16m1( - __riscv_vslidedown_vx_u16m1(a, 4, 8), - __riscv_vslidedown_vx_u16m1(b, 4, 8), - 4, - 8 + __riscv_vslidedown_vx_u16m1(a, npyv_nlanes_u16 / 2, npyv_nlanes_u16), + __riscv_vslidedown_vx_u16m1(b, npyv_nlanes_u16 / 2, npyv_nlanes_u16), + npyv_nlanes_u16 / 2, + npyv_nlanes_u16 ); } NPY_FINLINE npyv_u32 npyv_combineh_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vslideup_vx_u32m1( - __riscv_vslidedown_vx_u32m1(a, 2, 4), - __riscv_vslidedown_vx_u32m1(b, 2, 4), - 2, - 4 + __riscv_vslidedown_vx_u32m1(a, npyv_nlanes_u32 / 2, npyv_nlanes_u32), + __riscv_vslidedown_vx_u32m1(b, npyv_nlanes_u32 / 2, npyv_nlanes_u32), 
+ npyv_nlanes_u32 / 2, + npyv_nlanes_u32 ); } NPY_FINLINE npyv_u64 npyv_combineh_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vslideup_vx_u64m1( - __riscv_vslidedown_vx_u64m1(a, 1, 2), - __riscv_vslidedown_vx_u64m1(b, 1, 2), - 1, - 2 + __riscv_vslidedown_vx_u64m1(a, npyv_nlanes_u64 / 2, npyv_nlanes_u64), + __riscv_vslidedown_vx_u64m1(b, npyv_nlanes_u64 / 2, npyv_nlanes_u64), + npyv_nlanes_u64 / 2, + npyv_nlanes_u64 ); } NPY_FINLINE npyv_s8 npyv_combineh_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vslideup_vx_i8m1( - __riscv_vslidedown_vx_i8m1(a, 8, 16), - __riscv_vslidedown_vx_i8m1(b, 8, 16), - 8, - 16 + __riscv_vslidedown_vx_i8m1(a, npyv_nlanes_s8 / 2, npyv_nlanes_s8), + __riscv_vslidedown_vx_i8m1(b, npyv_nlanes_s8 / 2, npyv_nlanes_s8), + npyv_nlanes_s8 / 2, + npyv_nlanes_s8 ); } NPY_FINLINE npyv_s16 npyv_combineh_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vslideup_vx_i16m1( - __riscv_vslidedown_vx_i16m1(a, 4, 8), - __riscv_vslidedown_vx_i16m1(b, 4, 8), - 4, - 8 + __riscv_vslidedown_vx_i16m1(a, npyv_nlanes_s16 / 2, npyv_nlanes_s16), + __riscv_vslidedown_vx_i16m1(b, npyv_nlanes_s16 / 2, npyv_nlanes_s16), + npyv_nlanes_s16 / 2, + npyv_nlanes_s16 ); } NPY_FINLINE npyv_s32 npyv_combineh_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vslideup_vx_i32m1( - __riscv_vslidedown_vx_i32m1(a, 2, 4), - __riscv_vslidedown_vx_i32m1(b, 2, 4), - 2, - 4 + __riscv_vslidedown_vx_i32m1(a, npyv_nlanes_s32 / 2, npyv_nlanes_s32), + __riscv_vslidedown_vx_i32m1(b, npyv_nlanes_s32 / 2, npyv_nlanes_s32), + npyv_nlanes_s32 / 2, + npyv_nlanes_s32 ); } NPY_FINLINE npyv_s64 npyv_combineh_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vslideup_vx_i64m1( - __riscv_vslidedown_vx_i64m1(a, 1, 2), - __riscv_vslidedown_vx_i64m1(b, 1, 2), - 1, - 2 + __riscv_vslidedown_vx_i64m1(a, npyv_nlanes_s64 / 2, npyv_nlanes_s64), + __riscv_vslidedown_vx_i64m1(b, npyv_nlanes_s64 / 2, npyv_nlanes_s64), + npyv_nlanes_s64 / 2, + npyv_nlanes_s64 ); } NPY_FINLINE npyv_f32 npyv_combineh_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vslideup_vx_f32m1( - __riscv_vslidedown_vx_f32m1(a, 2, 4), - __riscv_vslidedown_vx_f32m1(b, 2, 4), - 2, - 4 + __riscv_vslidedown_vx_f32m1(a, npyv_nlanes_f32 / 2, npyv_nlanes_f32), + __riscv_vslidedown_vx_f32m1(b, npyv_nlanes_f32 / 2, npyv_nlanes_f32), + npyv_nlanes_f32 / 2, + npyv_nlanes_f32 ); } NPY_FINLINE npyv_f64 npyv_combineh_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vslideup_vx_f64m1( - __riscv_vslidedown_vx_f64m1(a, 1, 2), - __riscv_vslidedown_vx_f64m1(b, 1, 2), - 1, - 2 + __riscv_vslidedown_vx_f64m1(a, npyv_nlanes_f64 / 2, npyv_nlanes_f64), + __riscv_vslidedown_vx_f64m1(b, npyv_nlanes_f64 / 2, npyv_nlanes_f64), + npyv_nlanes_f64 / 2, + npyv_nlanes_f64 ); } // combine two vectors from lower and higher parts of two other vectors -#define NPYV_IMPL_RVV_COMBINE(T_VEC, SFX) \ +#define NPYV_IMPL_RVV_COMBINE(T_VEC, SFX) \ NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \ { \ - T_VEC##x2 r; \ - r.val[0] = NPY_CAT(npyv_combinel_, SFX)(a, b); \ - r.val[1] = NPY_CAT(npyv_combineh_, SFX)(a, b); \ - return r; \ + return (T_VEC##x2){{ \ + npyv_combinel_##SFX(a, b), \ + npyv_combineh_##SFX(a, b) \ + }}; \ } NPYV_IMPL_RVV_COMBINE(npyv_u8, u8) @@ -177,794 +148,121 @@ NPYV_IMPL_RVV_COMBINE(npyv_u64, u64) NPYV_IMPL_RVV_COMBINE(npyv_s64, s64) NPYV_IMPL_RVV_COMBINE(npyv_f32, f32) NPYV_IMPL_RVV_COMBINE(npyv_f64, f64) +#undef NPYV_IMPL_RVV_COMBINE // interleave & deinterleave two vectors -NPY_FINLINE npyv_u8 vzip1q_u8(npyv_u8 a, npyv_u8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - 
- npyv_u8 interleave_idx = __riscv_vsrl_vx_u8m1(index, 1, vl); - - npyv_u8 a_gathered = __riscv_vrgather_vv_u8m1(a, interleave_idx, vl); - npyv_u8 b_gathered = __riscv_vrgather_vv_u8m1(b, interleave_idx, vl); - - vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( - __riscv_vand_vx_u8m1(index, 1, vl), - 0, - vl - ); - - return __riscv_vmerge_vvm_u8m1(a_gathered, b_gathered, mask, vl); -} - -NPY_FINLINE npyv_u8 vzip2q_u8(npyv_u8 a, npyv_u8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 interleave_idx = __riscv_vadd_vx_u8m1( - __riscv_vsrl_vx_u8m1(index, 1, vl), - 8, - vl - ); - - npyv_u8 a_gathered = __riscv_vrgather_vv_u8m1(a, interleave_idx, vl); - npyv_u8 b_gathered = __riscv_vrgather_vv_u8m1(b, interleave_idx, vl); - - vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( - __riscv_vand_vx_u8m1(index, 1, vl), - 0, - vl - ); - - return __riscv_vmerge_vvm_u8m1(a_gathered, b_gathered, mask, vl); -} - -NPY_FINLINE npyv_u8 vuzp1q_u8(npyv_u8 a, npyv_u8 b) { - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 gather_idx_a = __riscv_vmul_vx_u8m1(index, 2, vl); - npyv_u8 gather_idx_b = gather_idx_a; - - vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); - - gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); - - npyv_u8 result_a = __riscv_vrgather_vv_u8m1(a, gather_idx_a, vl); - npyv_u8 result_b = __riscv_vrgather_vv_u8m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_u8m1(result_a, result_b, high_mask, vl); -} - -NPY_FINLINE npyv_u8 vuzp2q_u8(npyv_u8 a, npyv_u8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 gather_idx_a = __riscv_vadd_vx_u8m1( - __riscv_vmul_vx_u8m1(index, 2, vl), - 1, - vl - ); - - npyv_u8 gather_idx_b = gather_idx_a; - - vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); - - gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); - - npyv_u8 result_a = __riscv_vrgather_vv_u8m1(a, gather_idx_a, vl); - npyv_u8 result_b = __riscv_vrgather_vv_u8m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_u8m1(result_a, result_b, high_mask, vl); -} - -NPY_FINLINE npyv_s8 vzip1q_s8(npyv_s8 a, npyv_s8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 interleave_idx = __riscv_vsrl_vx_u8m1(index, 1, vl); - - npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, interleave_idx, vl); - npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, interleave_idx, vl); - - vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( - __riscv_vand_vx_u8m1(index, 1, vl), - 0, - vl - ); - - return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, mask, vl); -} - -NPY_FINLINE npyv_s8 vzip2q_s8(npyv_s8 a, npyv_s8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 interleave_idx = __riscv_vadd_vx_u8m1( - __riscv_vsrl_vx_u8m1(index, 1, vl), - 8, - vl - ); - - npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, interleave_idx, vl); - npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, interleave_idx, vl); - - vbool8_t mask = __riscv_vmsne_vx_u8m1_b8( - __riscv_vand_vx_u8m1(index, 1, vl), - 0, - vl - ); - - return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, mask, vl); -} - -NPY_FINLINE npyv_s8 vuzp1q_s8(npyv_s8 a, npyv_s8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 gather_idx_a = __riscv_vmul_vx_u8m1(index, 2, vl); - npyv_u8 gather_idx_b = gather_idx_a; - - vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); - - 
gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); - - npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, gather_idx_a, vl); - npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_s8 vuzp2q_s8(npyv_s8 a, npyv_s8 b) -{ - size_t vl = __riscv_vsetvlmax_e8m1(); - npyv_u8 index = __riscv_vid_v_u8m1(vl); - - npyv_u8 gather_idx_a = __riscv_vadd_vx_u8m1( - __riscv_vmul_vx_u8m1(index, 2, vl), - 1, - vl - ); - - npyv_u8 gather_idx_b = gather_idx_a; - - vbool8_t high_mask = __riscv_vmsgtu_vx_u8m1_b8(index, 7, vl); - - gather_idx_b = __riscv_vsub_vx_u8m1(gather_idx_b, 16, vl); - - npyv_s8 a_gathered = __riscv_vrgather_vv_i8m1(a, gather_idx_a, vl); - npyv_s8 b_gathered = __riscv_vrgather_vv_i8m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_i8m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_u16 vzip1q_u16(npyv_u16 a, npyv_u16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 interleave_idx = __riscv_vsrl_vx_u16m1(index, 1, vl); - - npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, interleave_idx, vl); - npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, interleave_idx, vl); - - vbool16_t mask = __riscv_vmsne_vx_u16m1_b16( - __riscv_vand_vx_u16m1(index, 1, vl), - 0, - vl - ); - - return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, mask, vl); -} - -NPY_FINLINE npyv_u16 vzip2q_u16(npyv_u16 a, npyv_u16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 interleave_idx = __riscv_vadd_vx_u16m1( - __riscv_vsrl_vx_u16m1(index, 1, vl), - 4, - vl - ); - - npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, interleave_idx, vl); - npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, interleave_idx, vl); - - vbool16_t mask = __riscv_vmsne_vx_u16m1_b16( - __riscv_vand_vx_u16m1(index, 1, vl), - 0, - vl - ); - - return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, mask, vl); -} - -NPY_FINLINE npyv_u16 vuzp1q_u16(npyv_u16 a, npyv_u16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 gather_idx_a = __riscv_vmul_vx_u16m1(index, 2, vl); - - npyv_u16 gather_idx_b = __riscv_vmul_vx_u16m1( - __riscv_vsub_vx_u16m1(index, 4, vl), - 2, - vl - ); - - npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); - vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); - - npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, gather_idx_a, vl); - npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_u16 vuzp2q_u16(npyv_u16 a, npyv_u16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 gather_idx_a = __riscv_vadd_vx_u16m1( - __riscv_vmul_vx_u16m1(index, 2, vl), - 1, - vl - ); - - npyv_u16 gather_idx_b = __riscv_vadd_vx_u16m1( - __riscv_vmul_vx_u16m1( - __riscv_vsub_vx_u16m1(index, 4, vl), - 2, - vl - ), - 1, - vl - ); - - npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); - vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); - - npyv_u16 a_gathered = __riscv_vrgather_vv_u16m1(a, gather_idx_a, vl); - npyv_u16 b_gathered = __riscv_vrgather_vv_u16m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_u16m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_s16 vzip1q_s16(npyv_s16 a, npyv_s16 b) -{ - 
size_t vl = __riscv_vsetvlmax_e16m1(); - - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 gather_idx = __riscv_vsrl_vx_u16m1(index, 1, vl); - - vbool16_t sel_mask = __riscv_vmsne_vx_u16m1_b16( - __riscv_vand_vx_u16m1(index, 1, vl), - 0, - vl - ); - - npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx, vl); - npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_s16 vzip2q_s16(npyv_s16 a, npyv_s16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 gather_idx = __riscv_vadd_vx_u16m1( - __riscv_vsrl_vx_u16m1(index, 1, vl), - 4, - vl - ); - - vbool16_t sel_mask = __riscv_vmsne_vx_u16m1_b16( - __riscv_vand_vx_u16m1(index, 1, vl), - 0, - vl - ); - - npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx, vl); - npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_s16 vuzp1q_s16(npyv_s16 a, npyv_s16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 gather_idx_a = __riscv_vmul_vx_u16m1(index, 2, vl); - - npyv_u16 gather_idx_b = __riscv_vmul_vx_u16m1( - __riscv_vsub_vx_u16m1(index, 4, vl), - 2, - vl - ); - - npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); - vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); - - npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx_a, vl); - npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_s16 vuzp2q_s16(npyv_s16 a, npyv_s16 b) -{ - size_t vl = __riscv_vsetvlmax_e16m1(); - - npyv_u16 index = __riscv_vid_v_u16m1(vl); - - npyv_u16 gather_idx_a = __riscv_vadd_vx_u16m1( - __riscv_vmul_vx_u16m1(index, 2, vl), - 1, - vl - ); - - npyv_u16 gather_idx_b = __riscv_vadd_vx_u16m1( - __riscv_vmul_vx_u16m1( - __riscv_vsub_vx_u16m1(index, 4, vl), - 2, - vl - ), - 1, - vl - ); - - npyv_s16 signed_index = __riscv_vreinterpret_v_u16m1_i16m1(index); - vbool16_t high_mask = __riscv_vmsgt_vx_i16m1_b16(signed_index, 3, vl); - - npyv_s16 a_gathered = __riscv_vrgather_vv_i16m1(a, gather_idx_a, vl); - npyv_s16 b_gathered = __riscv_vrgather_vv_i16m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_i16m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_u32 vzip1q_u32(npyv_u32 a, npyv_u32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx = __riscv_vsrl_vx_u32m1(index, 1, vl); - - vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( - __riscv_vand_vx_u32m1(index, 1, vl), - 0, - vl - ); - - npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx, vl); - npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_u32 vzip2q_u32(npyv_u32 a, npyv_u32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx = __riscv_vadd_vx_u32m1( - __riscv_vsrl_vx_u32m1(index, 1, vl), - 2, - vl - ); - - vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( - __riscv_vand_vx_u32m1(index, 1, vl), - 0, - vl - ); - - npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx, vl); - npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx, vl); - - 
return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_u32 vuzp1q_u32(npyv_u32 a, npyv_u32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx_a = __riscv_vmul_vx_u32m1(index, 2, vl); - - npyv_u32 gather_idx_b = __riscv_vmul_vx_u32m1( - __riscv_vsub_vx_u32m1(index, 2, vl), - 2, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); - - npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx_a, vl); - npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_u32 vuzp2q_u32(npyv_u32 a, npyv_u32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx_a = __riscv_vadd_vx_u32m1( - __riscv_vmul_vx_u32m1(index, 2, vl), - 1, - vl - ); - - npyv_u32 gather_idx_b = __riscv_vadd_vx_u32m1( - __riscv_vmul_vx_u32m1( - __riscv_vsub_vx_u32m1(index, 2, vl), - 2, - vl - ), - 1, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); - - npyv_u32 a_gathered = __riscv_vrgather_vv_u32m1(a, gather_idx_a, vl); - npyv_u32 b_gathered = __riscv_vrgather_vv_u32m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_u32m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_s32 vzip1q_s32(npyv_s32 a, npyv_s32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx = __riscv_vsrl_vx_u32m1(index, 1, vl); - - vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( - __riscv_vand_vx_u32m1(index, 1, vl), - 0, - vl - ); - - npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx, vl); - npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_s32 vzip2q_s32(npyv_s32 a, npyv_s32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx = __riscv_vadd_vx_u32m1( - __riscv_vsrl_vx_u32m1(index, 1, vl), - 2, - vl - ); - - vbool32_t sel_mask = __riscv_vmsne_vx_u32m1_b32( - __riscv_vand_vx_u32m1(index, 1, vl), - 0, - vl - ); - - npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx, vl); - npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_s32 vuzp1q_s32(npyv_s32 a, npyv_s32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx_a = __riscv_vmul_vx_u32m1(index, 2, vl); - - npyv_u32 gather_idx_b = __riscv_vmul_vx_u32m1( - __riscv_vsub_vx_u32m1(index, 2, vl), - 2, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); - - npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx_a, vl); - npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_s32 vuzp2q_s32(npyv_s32 a, npyv_s32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx_a = __riscv_vadd_vx_u32m1( - 
__riscv_vmul_vx_u32m1(index, 2, vl), - 1, - vl - ); - - npyv_u32 gather_idx_b = __riscv_vadd_vx_u32m1( - __riscv_vmul_vx_u32m1( - __riscv_vsub_vx_u32m1(index, 2, vl), - 2, - vl - ), - 1, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); - - npyv_s32 a_gathered = __riscv_vrgather_vv_i32m1(a, gather_idx_a, vl); - npyv_s32 b_gathered = __riscv_vrgather_vv_i32m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_i32m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_f32 vzip1q_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx = __riscv_vsrl_vx_u32m1(index, 1, vl); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t sel_mask = __riscv_vmsgt_vx_i32m1_b32( - __riscv_vand_vx_i32m1(signed_index, 1, vl), - 0, - vl - ); - - npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx, vl); - npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_f32 vzip2q_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx = __riscv_vadd_vx_u32m1( - __riscv_vsrl_vx_u32m1(index, 1, vl), - 2, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t sel_mask = __riscv_vmsgt_vx_i32m1_b32( - __riscv_vand_vx_i32m1(signed_index, 1, vl), - 0, - vl - ); - - npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx, vl); - npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx, vl); - - return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, sel_mask, vl); -} - -NPY_FINLINE npyv_f32 vuzp1q_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx_a = __riscv_vmul_vx_u32m1(index, 2, vl); - - npyv_u32 gather_idx_b = __riscv_vmul_vx_u32m1( - __riscv_vsub_vx_u32m1(index, 2, vl), - 2, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); - - npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx_a, vl); - npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, high_mask, vl); -} - -NPY_FINLINE npyv_f32 vuzp2q_f32(npyv_f32 a, npyv_f32 b) -{ - size_t vl = __riscv_vsetvlmax_e32m1(); - - npyv_u32 index = __riscv_vid_v_u32m1(vl); - - npyv_u32 gather_idx_a = __riscv_vadd_vx_u32m1( - __riscv_vmul_vx_u32m1(index, 2, vl), - 1, - vl - ); - - npyv_u32 gather_idx_b = __riscv_vadd_vx_u32m1( - __riscv_vmul_vx_u32m1( - __riscv_vsub_vx_u32m1(index, 2, vl), - 2, - vl - ), - 1, - vl - ); - - npyv_s32 signed_index = __riscv_vreinterpret_v_u32m1_i32m1(index); - vbool32_t high_mask = __riscv_vmsgt_vx_i32m1_b32(signed_index, 1, vl); - - npyv_f32 a_gathered = __riscv_vrgather_vv_f32m1(a, gather_idx_a, vl); - npyv_f32 b_gathered = __riscv_vrgather_vv_f32m1(b, gather_idx_b, vl); - - return __riscv_vmerge_vvm_f32m1(a_gathered, b_gathered, high_mask, vl); -} - -// interleave & deinterleave two vectors -#define NPYV_IMPL_RVV_ZIP(T_VEC, SFX) \ +#define NPYV_IMPL_RVV_ZIP(T_VEC, SFX, R_SFX, EEW) \ NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ { \ - T_VEC##x2 r; \ - r.val[0] = vzip1q_##SFX(a, b); \ - r.val[1] 
= vzip2q_##SFX(a, b); \ - return r; \ + const int vl = npyv_nlanes_##SFX; \ + npyv_lanetype_##SFX v[vl * 2]; \ + __riscv_vsseg2##EEW( \ + v, __riscv_vcreate_v_##R_SFX##m1x2(a, b), vl \ + ); \ + return (T_VEC##x2){{ \ + __riscv_vl##EEW##_v_##R_SFX##m1(v , vl), \ + __riscv_vl##EEW##_v_##R_SFX##m1(v + vl, vl) \ + }}; \ } \ NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ { \ - T_VEC##x2 r; \ - r.val[0] = vuzp1q_##SFX(a, b); \ - r.val[1] = vuzp2q_##SFX(a, b); \ - return r; \ + const int vl = npyv_nlanes_##SFX; \ + npyv_lanetype_##SFX v[vl * 2]; \ + __riscv_vs##EEW(v , a, vl); \ + __riscv_vs##EEW(v + vl, b, vl); \ + npyv__##SFX##x2 d = \ + __riscv_vlseg2##EEW##_v_##R_SFX##m1x2(v, vl); \ + return (T_VEC##x2){{ \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(d, 0), \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(d, 1) \ + }}; \ } -NPYV_IMPL_RVV_ZIP(npyv_u8, u8) -NPYV_IMPL_RVV_ZIP(npyv_s8, s8) -NPYV_IMPL_RVV_ZIP(npyv_u16, u16) -NPYV_IMPL_RVV_ZIP(npyv_s16, s16) -NPYV_IMPL_RVV_ZIP(npyv_u32, u32) -NPYV_IMPL_RVV_ZIP(npyv_s32, s32) -NPYV_IMPL_RVV_ZIP(npyv_f32, f32) - -#define NPYV_IMPL_RVV_ZIP2(SFX) \ - NPY_FINLINE npyv_##SFX##x2 npyv_zip_##SFX(npyv_##SFX a, npyv_##SFX b) \ - { \ - npyv_##SFX##x2 r; \ - r.val[0] = npyv_combinel_##SFX(a, b); \ - r.val[1] = npyv_combineh_##SFX(a, b); \ - return r; \ - } \ - \ - NPY_FINLINE npyv_##SFX##x2 npyv_unzip_##SFX(npyv_##SFX a, npyv_##SFX b) \ - { \ - npyv_##SFX##x2 r; \ - r.val[0] = npyv_combinel_##SFX(a, b); \ - r.val[1] = npyv_combineh_##SFX(a, b); \ - return r; \ - } - -NPYV_IMPL_RVV_ZIP2(u64) -NPYV_IMPL_RVV_ZIP2(s64) -NPYV_IMPL_RVV_ZIP2(f64) +NPYV_IMPL_RVV_ZIP(npyv_u8, u8, u8, e8) +NPYV_IMPL_RVV_ZIP(npyv_s8, s8, i8, e8) +NPYV_IMPL_RVV_ZIP(npyv_u16, u16, u16, e16) +NPYV_IMPL_RVV_ZIP(npyv_s16, s16, i16, e16) +NPYV_IMPL_RVV_ZIP(npyv_u32, u32, u32, e32) +NPYV_IMPL_RVV_ZIP(npyv_s32, s32, i32, e32) +NPYV_IMPL_RVV_ZIP(npyv_u64, u64, u64, e64) +NPYV_IMPL_RVV_ZIP(npyv_s64, s64, i64, e64) +NPYV_IMPL_RVV_ZIP(npyv_f32, f32, f32, e32) +NPYV_IMPL_RVV_ZIP(npyv_f64, f64, f64, e64) +#undef NPYV_IMPL_RVV_ZIP // Reverse elements of each 64-bit lane NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) { - npyv_u8 vid = __riscv_vid_v_u8m1(8); - npyv_u8 vid_slideup = __riscv_vslideup_vx_u8m1(vid, vid, 8, 16); - npyv_u8 sub = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(7, 16), __riscv_vmv_v_x_u8m1(7 + 8, 16), 8, 16); - npyv_u8 idxs = __riscv_vsub_vv_u8m1(sub, vid_slideup, 16); - return __riscv_vrgather_vv_u8m1(a, idxs, 16); -} - -NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a) { - npyv_u8 vid = __riscv_vid_v_u8m1(8); - npyv_u8 vid_slideup = __riscv_vslideup_vx_u8m1(vid, vid, 8, 16); - npyv_u8 sub = __riscv_vslideup_vx_u8m1(__riscv_vmv_v_x_u8m1(7,16), __riscv_vmv_v_x_u8m1(7 + 8, 16), 8, 16); - npyv_u8 idxs = __riscv_vsub_vv_u8m1(sub, vid_slideup, 16); - return __riscv_vrgather_vv_i8m1(a, idxs, 16); + vuint8m1_t vid = __riscv_vid_v_u8m1(npyv_nlanes_u8); + vuint8m1_t sub = __riscv_vadd(__riscv_vsll(__riscv_vsrl(vid, 3, npyv_nlanes_u8), 4, npyv_nlanes_u8), 7, npyv_nlanes_u8); + vuint8m1_t idxs = __riscv_vsub(sub, vid, npyv_nlanes_u8); + return __riscv_vrgather(a, idxs, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_u8m1_i8m1(npyv_rev64_u8(__riscv_vreinterpret_v_i8m1_u8m1(a))); } -NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) { - npyv_u16 vid = __riscv_vid_v_u16m1(4); - npyv_u16 vid_slideup = __riscv_vslideup_vx_u16m1(vid, vid, 4, 8); - npyv_u16 sub = __riscv_vslideup_vx_u16m1(__riscv_vmv_v_x_u16m1(3, 8), __riscv_vmv_v_x_u16m1(3 + 4, 8), 
4, 8); - npyv_u16 idxs = __riscv_vsub_vv_u16m1(sub, vid_slideup, 8); - return __riscv_vrgather_vv_u16m1(a, idxs, 8); -} - -NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a) { - npyv_u16 vid = __riscv_vid_v_u16m1(4); - npyv_u16 vid_slideup = __riscv_vslideup_vx_u16m1(vid, vid, 4, 8); - npyv_u16 sub = __riscv_vslideup_vx_u16m1(__riscv_vmv_v_x_u16m1(3, 8), __riscv_vmv_v_x_u16m1(3 + 4, 8), 4, 8); - npyv_u16 idxs = __riscv_vsub_vv_u16m1(sub, vid_slideup, 8); - return __riscv_vrgather_vv_i16m1(a, idxs, 8); -} - -NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a) { - npyv_u32 vid = __riscv_vid_v_u32m1(2); - npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); - npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); - npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); - return __riscv_vrgather_vv_u32m1(a, idxs, 4); -} - -NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) { - npyv_u32 vid = __riscv_vid_v_u32m1(2); - npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); - npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); - npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); - return __riscv_vrgather_vv_i32m1(a, idxs, 4); +NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) +{ + vuint16m1_t vid = __riscv_vid_v_u16m1(npyv_nlanes_u16); + vuint16m1_t sub = __riscv_vadd(__riscv_vsll(__riscv_vsrl(vid, 2, npyv_nlanes_u16), 3, npyv_nlanes_u16), 3, npyv_nlanes_u16); + vuint16m1_t idxs = __riscv_vsub(sub, vid, npyv_nlanes_u16); + return __riscv_vrgather(a, idxs, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_u16m1_i16m1(npyv_rev64_u16(__riscv_vreinterpret_v_i16m1_u16m1(a))); } -NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) { - npyv_u32 vid = __riscv_vid_v_u32m1(2); - npyv_u32 vid_slideup = __riscv_vslideup_vx_u32m1(vid, vid, 2, 4); - npyv_u32 sub = __riscv_vslideup_vx_u32m1(__riscv_vmv_v_x_u32m1(1, 4), __riscv_vmv_v_x_u32m1(1 + 2, 4), 2, 4); - npyv_u32 idxs = __riscv_vsub_vv_u32m1(sub, vid_slideup, 4); - return __riscv_vrgather_vv_f32m1(a, idxs, 4); +NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a) +{ + vuint16mf2_t vid = __riscv_vid_v_u16mf2(npyv_nlanes_u16 / 2); + vuint16mf2_t sub = __riscv_vadd(__riscv_vsll(__riscv_vsrl(vid, 1, npyv_nlanes_u16 / 2), 2, npyv_nlanes_u16 / 2), 1, npyv_nlanes_u16 / 2); + vuint16mf2_t idxs = __riscv_vsub(sub, vid, npyv_nlanes_u16 / 2); + return __riscv_vrgatherei16(a, idxs, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_u32m1_i32m1(npyv_rev64_u32(__riscv_vreinterpret_v_i32m1_u32m1(a))); } +NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_u32m1_f32m1(npyv_rev64_u32(__riscv_vreinterpret_v_f32m1_u32m1(a))); } // Permuting the elements of each 128-bit lane by immediate index for // each element. 
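/*
 * A sketch, not part of this patch: a scalar model of the vrgatherei16-based
 * npyv_permi128_u32 defined below.  Every 128-bit block (four uint32 lanes)
 * is permuted independently, which is what the E0+4*k, E1+4*k, ... entries
 * of the index table encode.  The function and parameter names here are
 * illustrative only.
 */
#include <stdint.h>

static void permi128_u32_model(uint32_t *dst, const uint32_t *src,
                               int nlanes, int e0, int e1, int e2, int e3)
{
    const int e[4] = {e0, e1, e2, e3};
    for (int i = 0; i < nlanes; ++i) {
        int block = i / 4;                   /* which 128-bit block       */
        dst[i] = src[block * 4 + e[i % 4]];  /* lane e[i%4] of that block */
    }
}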
-#define npyv_permi128_u32(A, E0, E1, E2, E3) \ - npyv_set_u32( \ - __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E0, 4)), __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E1, 4)), \ - __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E2, 4)), __riscv_vmv_x_s_u32m1_u32(__riscv_vslidedown_vx_u32m1(A, E3, 4)) \ - ) -#define npyv_permi128_s32(A, E0, E1, E2, E3) \ - npyv_set_s32( \ - __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E0, 4)), __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E1, 4)), \ - __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E2, 4)), __riscv_vmv_x_s_i32m1_i32(__riscv_vslidedown_vx_i32m1(A, E3, 4)) \ - ) -#define npyv_permi128_f32(A, E0, E1, E2, E3) \ - npyv_set_f32( \ - __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E0, 4)), __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E1, 4)), \ - __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E2, 4)), __riscv_vfmv_f_s_f32m1_f32(__riscv_vslidedown_vx_f32m1(A, E3, 4)) \ - ) - -#define npyv_permi128_u64(A, E0, E1) \ - npyv_set_u64( \ - __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(A, E0, 2)), __riscv_vmv_x_s_u64m1_u64(__riscv_vslidedown_vx_u64m1(A, E1, 2)) \ - ) -#define npyv_permi128_s64(A, E0, E1) \ - npyv_set_s64( \ - __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(A, E0, 2)), __riscv_vmv_x_s_i64m1_i64(__riscv_vslidedown_vx_i64m1(A, E1, 2)) \ - ) -#define npyv_permi128_f64(A, E0, E1) \ - npyv_set_f64( \ - __riscv_vfmv_f_s_f64m1_f64(__riscv_vslidedown_vx_f64m1(A, E0, 2)), __riscv_vfmv_f_s_f64m1_f64(__riscv_vslidedown_vx_f64m1(A, E1, 2)) \ - ) +#define npyv_permi128_u32(A, E0, E1, E2, E3) \ + ({ \ + const uint16_t v[] = { \ + E0 , E1 , E2 , E3 , \ + E0 + 4, E1 + 4, E2 + 4, E3 + 4, \ + E0 + 8, E1 + 8, E2 + 8, E3 + 8, \ + E0 + 12, E1 + 12, E2 + 12, E3 + 12, \ + E0 + 16, E1 + 16, E2 + 16, E3 + 16, \ + E0 + 20, E1 + 20, E2 + 20, E3 + 20, \ + E0 + 24, E1 + 24, E2 + 24, E3 + 24, \ + E0 + 28, E1 + 28, E2 + 28, E3 + 28 \ + }; \ + __riscv_vrgatherei16( \ + A, __riscv_vle16_v_u16mf2(v, npyv_nlanes_u32), \ + npyv_nlanes_u32 \ + ); \ + }) +#define npyv_permi128_s32(A, E0, E1, E2, E3) __riscv_vreinterpret_v_u32m1_i32m1(npyv_permi128_u32(__riscv_vreinterpret_v_i32m1_u32m1(A), E0, E1, E2, E3)) +#define npyv_permi128_f32(A, E0, E1, E2, E3) __riscv_vreinterpret_v_u32m1_f32m1(npyv_permi128_u32(__riscv_vreinterpret_v_f32m1_u32m1(A), E0, E1, E2, E3)) + +#define npyv_permi128_u64(A, E0, E1) \ + ({ \ + const uint16_t v[] = { \ + E0 , E1 , \ + E0 + 2, E1 + 2, \ + E0 + 4, E1 + 4, \ + E0 + 6, E1 + 6, \ + E0 + 8, E1 + 8, \ + E0 + 10, E1 + 10, \ + E0 + 12, E1 + 12, \ + E0 + 14, E1 + 14 \ + }; \ + __riscv_vrgatherei16( \ + A, __riscv_vle16_v_u16mf4(v, npyv_nlanes_u64), \ + npyv_nlanes_u64 \ + ); \ + }) +#define npyv_permi128_s64(A, E0, E1) __riscv_vreinterpret_v_u64m1_i64m1(npyv_permi128_u64(__riscv_vreinterpret_v_i64m1_u64m1(A), E0, E1)) +#define npyv_permi128_f64(A, E0, E1) __riscv_vreinterpret_v_u64m1_f64m1(npyv_permi128_u64(__riscv_vreinterpret_v_f64m1_u64m1(A), E0, E1)) #endif // _NPY_SIMD_RVV_REORDER_H diff --git a/numpy/_core/src/common/simd/rvv/rvv.h b/numpy/_core/src/common/simd/rvv/rvv.h index 41a4d8fd42ec..f798976da8a1 100644 --- a/numpy/_core/src/common/simd/rvv/rvv.h +++ b/numpy/_core/src/common/simd/rvv/rvv.h @@ -2,9 +2,13 @@ #error "Not a standalone header" #endif -#define NPY_SIMD 128 -#define NPY_SIMD_WIDTH 16 -#define NPY_SIMD_F32 0 +#include + +// supports VLEN 128, 256 and 512 +// it is impossible to implement npyv_tobits_b8 when VLEN>512 
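+// (npyv_tobits_b8 packs one bit per 8-bit lane into an npy_uint64, which can
+// only hold the 64 byte lanes of a 512-bit vector)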
+#define NPY_SIMD __riscv_v_fixed_vlen +#define NPY_SIMD_WIDTH (__riscv_v_fixed_vlen / 8) +#define NPY_SIMD_F32 1 #define NPY_SIMD_F64 1 #ifdef NPY_HAVE_FMA3 @@ -35,12 +39,14 @@ typedef vfloat64m1_t fixed_vfloat64m1_t __attribute__((riscv_rvv_vector_bits(__r #define npyv_s16 fixed_vint16m1_t #define npyv_s32 fixed_vint32m1_t #define npyv_s64 fixed_vint64m1_t +#define npyv_f32 fixed_vfloat32m1_t +#define npyv_f64 fixed_vfloat64m1_t + +// simulate bool as uint due to gcc/clang bugs, change to fixed_vbool if possible #define npyv_b8 fixed_vuint8m1_t #define npyv_b16 fixed_vuint16m1_t #define npyv_b32 fixed_vuint32m1_t #define npyv_b64 fixed_vuint64m1_t -#define npyv_f32 fixed_vfloat32m1_t -#define npyv_f64 fixed_vfloat64m1_t typedef struct { fixed_vuint8m1_t val[2]; } npyv_u8x2; @@ -66,16 +72,30 @@ typedef struct { fixed_vint64m1_t val[3]; } npyv_s64x3; typedef struct { fixed_vfloat32m1_t val[3]; } npyv_f32x3; typedef struct { fixed_vfloat64m1_t val[3]; } npyv_f64x3; -#define npyv_nlanes_u8 16 -#define npyv_nlanes_s8 16 -#define npyv_nlanes_u16 8 -#define npyv_nlanes_s16 8 -#define npyv_nlanes_u32 4 -#define npyv_nlanes_s32 4 -#define npyv_nlanes_u64 2 -#define npyv_nlanes_s64 2 -#define npyv_nlanes_f32 4 -#define npyv_nlanes_f64 2 + +// helper types +#define npyv__u8x2 vuint8m1x2_t +#define npyv__u16x2 vuint16m1x2_t +#define npyv__u32x2 vuint32m1x2_t +#define npyv__u64x2 vuint64m1x2_t +#define npyv__s8x2 vint8m1x2_t +#define npyv__s16x2 vint16m1x2_t +#define npyv__s32x2 vint32m1x2_t +#define npyv__s64x2 vint64m1x2_t +#define npyv__f32x2 vfloat32m1x2_t +#define npyv__f64x2 vfloat64m1x2_t + + +#define npyv_nlanes_u8 32 +#define npyv_nlanes_s8 32 +#define npyv_nlanes_u16 16 +#define npyv_nlanes_s16 16 +#define npyv_nlanes_u32 8 +#define npyv_nlanes_s32 8 +#define npyv_nlanes_u64 4 +#define npyv_nlanes_s64 4 +#define npyv_nlanes_f32 8 +#define npyv_nlanes_f64 4 #include "memory.h" #include "misc.h" diff --git a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src index 9f7ed6c1dfc4..c824d4aa3ae0 100644 --- a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src +++ b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src @@ -313,7 +313,7 @@ npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, NPY_FINLINE npyv_u32 npyv_signbit_f32(npyv_f32 v) { - return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); + return npyv_reinterpret_u32_s32(npyv_shri_s32(npyv_reinterpret_s32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1)); } NPY_FINLINE npyv_u8 npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) @@ -345,7 +345,7 @@ npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) NPY_FINLINE npyv_u64 npyv_signbit_f64(npyv_f64 v) { - return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1); + return npyv_reinterpret_u64_s64(npyv_shri_s64(npyv_reinterpret_s64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1)); } NPY_FINLINE npyv_u8 npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, @@ -486,10 +486,26 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; - #if npyv_nlanes_@sfx@ == 4 + #if npyv_nlanes_@sfx@ >= 4 op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; #endif + #if npyv_nlanes_@sfx@ >= 8 + op[4 * ostride] = lane[4 * sizeof(npyv_lanetype_@sfx@)]; + 
op[5 * ostride] = lane[5 * sizeof(npyv_lanetype_@sfx@)]; + op[6 * ostride] = lane[6 * sizeof(npyv_lanetype_@sfx@)]; + op[7 * ostride] = lane[7 * sizeof(npyv_lanetype_@sfx@)]; + #endif + #if npyv_nlanes_@sfx@ >= 16 + op[ 8 * ostride] = lane[ 8 * sizeof(npyv_lanetype_@sfx@)]; + op[ 9 * ostride] = lane[ 9 * sizeof(npyv_lanetype_@sfx@)]; + op[10 * ostride] = lane[10 * sizeof(npyv_lanetype_@sfx@)]; + op[11 * ostride] = lane[11 * sizeof(npyv_lanetype_@sfx@)]; + op[12 * ostride] = lane[12 * sizeof(npyv_lanetype_@sfx@)]; + op[13 * ostride] = lane[13 * sizeof(npyv_lanetype_@sfx@)]; + op[14 * ostride] = lane[14 * sizeof(npyv_lanetype_@sfx@)]; + op[15 * ostride] = lane[15 * sizeof(npyv_lanetype_@sfx@)]; + #endif } #undef PACK_FACTOR diff --git a/numpy/_core/tests/test_cpu_features.py b/numpy/_core/tests/test_cpu_features.py index 722000561a78..0797bb8c772c 100644 --- a/numpy/_core/tests/test_cpu_features.py +++ b/numpy/_core/tests/test_cpu_features.py @@ -432,12 +432,3 @@ class Test_LOONGARCH_Features(AbstractTest): def load_flags(self): self.load_flags_cpuinfo("Features") - - -is_riscv = re.match("^(riscv64)", machine, re.IGNORECASE) -@pytest.mark.skipif(not is_linux or not is_riscv, reason="Only for Linux and RISC-V") -class Test_RISCV_Features(AbstractTest): - features = ["RVV"] - - def load_flags(self): - self.load_flags_cpuinfo("Features") \ No newline at end of file From f79260002858831288742694f2096496d3553d85 Mon Sep 17 00:00:00 2001 From: amane-ame Date: Fri, 11 Apr 2025 23:46:01 +0800 Subject: [PATCH 3/3] Fix load2 and store2. --- numpy/_core/src/common/simd/rvv/memory.h | 77 +++++++++++++++++------- 1 file changed, 54 insertions(+), 23 deletions(-) diff --git a/numpy/_core/src/common/simd/rvv/memory.h b/numpy/_core/src/common/simd/rvv/memory.h index 713732fff08d..84d343d0aa01 100644 --- a/numpy/_core/src/common/simd/rvv/memory.h +++ b/numpy/_core/src/common/simd/rvv/memory.h @@ -242,7 +242,11 @@ NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) //// 64-bit load over 32-bit stride NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) -{ return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vlse64_v_u64m1((const uint64_t*)ptr, stride * sizeof(uint32_t), npyv_nlanes_u64)); } +{ + uint32_t v[npyv_nlanes_u32]; + __riscv_vsseg2e32(v, __riscv_vlsseg2e32_v_u32mf2x2((const uint32_t*)ptr, stride * sizeof(uint32_t), npyv_nlanes_u32 / 2), npyv_nlanes_u32 / 2); + return __riscv_vle32_v_u32m1(v, npyv_nlanes_u32); +} NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) { return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) @@ -251,9 +255,9 @@ NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) //// 128-bit load over 64-bit stride NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) { - vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64); - id = __riscv_vadd_vx_u64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), id, id, sizeof(uint64_t), npyv_nlanes_u64); - return __riscv_vloxei64_v_u64m1((const uint64_t*)ptr, id, npyv_nlanes_u64); + uint64_t v[npyv_nlanes_u64]; + __riscv_vsseg2e64(v, __riscv_vlsseg2e64_v_u64m1x2((const uint64_t*)ptr, stride * sizeof(uint64_t), npyv_nlanes_u64 / 2), npyv_nlanes_u64 / 2); + return __riscv_vle64_v_u64m1(v, npyv_nlanes_u64); } NPY_FINLINE 
npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride)
 { return npyv_reinterpret_s64_u64(npyv_loadn2_u64((const npy_uint64*)ptr, stride)); }
@@ -280,7 +284,11 @@ NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a)
 
 //// 64-bit store over 32-bit stride
 NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a)
-{ __riscv_vsse64((uint64_t*)ptr, stride * sizeof(uint32_t), __riscv_vreinterpret_v_u32m1_u64m1(a), npyv_nlanes_u64); }
+{
+    uint32_t v[npyv_nlanes_u32];
+    __riscv_vse32(v, a, npyv_nlanes_u32);
+    __riscv_vssseg2e32((uint32_t*)ptr, stride * sizeof(uint32_t), __riscv_vlseg2e32_v_u32mf2x2(v, npyv_nlanes_u32 / 2), npyv_nlanes_u32 / 2);
+}
 NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a)
 { npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); }
 NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
@@ -289,9 +297,9 @@ NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a)
 //// 128-bit store over 64-bit stride
 NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a)
 {
-    vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64);
-    id = __riscv_vadd_vx_u64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), id, id, sizeof(uint64_t), npyv_nlanes_u64);
-    __riscv_vsoxei64((uint64_t*)ptr, id, a, npyv_nlanes_u64);
+    uint64_t v[npyv_nlanes_u64];
+    __riscv_vse64(v, a, npyv_nlanes_u64);
+    __riscv_vssseg2e64((uint64_t*)ptr, stride * sizeof(uint64_t), __riscv_vlseg2e64_v_u64m1x2(v, npyv_nlanes_u64 / 2), npyv_nlanes_u64 / 2);
 }
 NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a)
 { npyv_storen2_u64((npy_uint64*)ptr, stride, npyv_reinterpret_u64_s64(a)); }
@@ -317,10 +325,13 @@ NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
 //// 64-bit nlane
 NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane,
                                          npy_int32 fill_lo, npy_int32 fill_hi)
-{ return __riscv_vreinterpret_v_i64m1_i32m1(npyv_load_till_s64((const npy_int64*)ptr, nlane, (uint64_t)fill_hi << 32 | fill_lo)); }
+{
+    const vint32m1_t fill = __riscv_vreinterpret_i32m1(__riscv_vmv_v_x_i64m1((int64_t)fill_hi << 32 | fill_lo, npyv_nlanes_s64));
+    return __riscv_vle32_v_i32m1_tu(fill, (const int32_t*)ptr, nlane * 2);
+}
 // fill zero to rest lanes
 NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane)
-{ return __riscv_vreinterpret_v_i64m1_i32m1(npyv_load_tillz_s64((const npy_int64*)ptr, nlane)); }
+{ return npyv_load_tillz_s32(ptr, nlane * 2); }
 
 //// 128-bit nlane
 NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
@@ -330,7 +341,7 @@ NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane,
     return __riscv_vle64_v_i64m1_tu(fill, (const int64_t*)ptr, nlane * 2);
 }
 NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane)
-{ return __riscv_vle64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(0, npyv_nlanes_s64), (const int64_t*)ptr, nlane * 2); }
+{ return npyv_load_tillz_s64(ptr, nlane * 2); }
 
 /*********************************
  * Non-contiguous partial load
@@ -349,7 +360,17 @@ NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride,
 //// 64-bit load over 32-bit stride
 NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane,
                                           npy_int32 fill_lo, npy_int32 fill_hi)
-{ return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vlse64_v_i64m1_tu(__riscv_vmv_v_x_i64m1((uint64_t)fill_hi << 32 | fill_lo, npyv_nlanes_s64), (const int64_t*)ptr, stride * sizeof(int32_t), nlane)); }
+{
+#if npyv_nlanes_s32 == 4
+    int32_t v[npyv_nlanes_s32] = { fill_lo, fill_hi, fill_lo, fill_hi };
+#elif npyv_nlanes_s32 == 8
+    int32_t v[npyv_nlanes_s32] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi };
+#elif npyv_nlanes_s32 == 16
+    int32_t v[npyv_nlanes_s32] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi };
+#endif
+    __riscv_vsseg2e32(v, __riscv_vlsseg2e32_v_i32mf2x2((const int32_t*)ptr, stride * sizeof(int32_t), nlane), nlane);
+    return __riscv_vle32_v_i32m1(v, npyv_nlanes_s32);
+}
 NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); }
 
@@ -357,11 +378,16 @@ NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride
 NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane,
                                           npy_int64 fill_lo, npy_int64 fill_hi)
 {
-    vbool64_t mask = __riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8));
-    vint64m1_t fill = __riscv_vmerge(__riscv_vmv_v_x_i64m1(fill_lo, npyv_nlanes_s64), fill_hi, mask, npyv_nlanes_s64);
-    vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64);
-    id = __riscv_vadd_vx_u64m1_mu(mask, id, id, sizeof(uint64_t), npyv_nlanes_u64);
-    return __riscv_vloxei64_v_i64m1_tu(fill, (const int64_t*)ptr, id, nlane * 2);
+#if npyv_nlanes_s64 == 4
+    int64_t v[npyv_nlanes_s64] = { fill_lo, fill_hi, fill_lo, fill_hi };
+#elif npyv_nlanes_s64 == 8
+    int64_t v[npyv_nlanes_s64] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi };
+#elif npyv_nlanes_s64 == 16
+    int64_t v[npyv_nlanes_s64] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi };
+#endif
+    nlane = nlane < npyv_nlanes_s64 / 2 ? nlane : npyv_nlanes_s64 / 2;
+    __riscv_vsseg2e64(v, __riscv_vlsseg2e64_v_i64m1x2((const int64_t*)ptr, stride * sizeof(int64_t), nlane), nlane);
+    return __riscv_vle64_v_i64m1(v, npyv_nlanes_s64);
 }
 NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane)
 { return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); }
 
@@ -379,11 +405,11 @@ NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a
 
 //// 64-bit nlane
 NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a)
-{ npyv_store_till_s64((npy_int64*)ptr, nlane, npyv_reinterpret_s64_s32(a)); }
+{ npyv_store_till_s32(ptr, nlane * 2, a); }
 
 //// 128-bit nlane
 NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a)
-{ npyv_store_till_s64((npy_int64*)ptr, nlane * 2, a); }
+{ npyv_store_till_s64(ptr, nlane * 2, a); }
 
 /*********************************
  * Non-contiguous partial store
@@ -398,14 +424,19 @@ NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp
 
 //// 64-bit store over 32-bit stride
 NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a)
-{ __riscv_vsse64((int64_t*)ptr, stride * sizeof(int32_t), __riscv_vreinterpret_v_i32m1_i64m1(a), nlane); }
+{
+    int32_t v[npyv_nlanes_s32];
+    __riscv_vse32(v, a, npyv_nlanes_s32);
+    __riscv_vssseg2e32((int32_t*)ptr, stride * sizeof(int32_t), __riscv_vlseg2e32_v_i32mf2x2(v, nlane), nlane);
+}
 
 //// 128-bit store over 64-bit stride
 NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a)
 {
-    vuint64m1_t id = __riscv_vmul(__riscv_vsrl(__riscv_vid_v_u64m1(npyv_nlanes_u64), 1, npyv_nlanes_u64), stride * sizeof(uint64_t), npyv_nlanes_u64);
-    id = __riscv_vadd_vx_u64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), id, id, sizeof(uint64_t), npyv_nlanes_u64);
-    __riscv_vsoxei64((int64_t*)ptr, id, a, nlane * 2);
+    int64_t v[npyv_nlanes_s64];
+    nlane = nlane < npyv_nlanes_s64 / 2 ? nlane : npyv_nlanes_s64 / 2;
+    __riscv_vse64(v, a, npyv_nlanes_s64);
+    __riscv_vssseg2e64((int64_t*)ptr, stride * sizeof(int64_t), __riscv_vlseg2e64_v_i64m1x2(v, nlane), nlane);
 }
 
 /*****************************************************************
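The hunks above replace the index-vector approach (vid/vsoxei64/vloxei64 with an 0xAA mask) with the RVV two-field segment intrinsics: strided vlsseg2/vssseg2 move whole (lo, hi) pairs per segment, a small stack scratch buffer bridges to a single full-width register, and the contiguous partial loads switch to the tail-undisturbed (_tu) forms so lanes past nlane keep the fill value. A minimal standalone sketch of the same gather-and-pack idea, assuming an RVV 1.0 intrinsics toolchain with <riscv_vector.h>; the helper name and loop are illustrative only, and the fill/clamp handling of npyv_loadn2_till_s32 is omitted:

#include <riscv_vector.h>
#include <stddef.h>
#include <stdint.h>

// Gather `npairs` (lo, hi) int32 pairs that start `stride` elements apart in
// `src` and pack them contiguously into `dst` -- the same job the
// vlsseg2e32/vsseg2e32 combination does inside npyv_loadn2_till_s32 above.
static void gather_pairs_i32(int32_t *dst, const int32_t *src,
                             ptrdiff_t stride, size_t npairs)
{
    for (size_t done = 0; done < npairs; ) {
        size_t vl = __riscv_vsetvl_e32m1(npairs - done);
        // Strided segment load: field 0 <- pair[0], field 1 <- pair[1];
        // consecutive segments are stride * sizeof(int32_t) bytes apart.
        vint32m1x2_t seg = __riscv_vlsseg2e32_v_i32m1x2(
            src + (ptrdiff_t)done * stride,
            stride * (ptrdiff_t)sizeof(int32_t), vl);
        // Unit-stride segment store re-interleaves the two fields:
        // dst[0],dst[1] = first pair, dst[2],dst[3] = second pair, ...
        __riscv_vsseg2e32_v_i32m1x2(dst + 2 * done, seg, vl);
        done += vl;
    }
}

The design point mirrored here is that a seg2 load/store handles one pair per segment, so no index vector or lane mask has to be materialised, and the detour through the scratch buffer lets the packed result be reloaded as one ordinary vle32/vle64 vector.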