diff --git a/.github/workflows/linux_qemu.yml b/.github/workflows/linux_qemu.yml index 1293e9c37c2f..0f9c81c55d2b 100644 --- a/.github/workflows/linux_qemu.yml +++ b/.github/workflows/linux_qemu.yml @@ -73,14 +73,6 @@ jobs: "(test_kind or test_multiarray or test_simd or test_umath or test_ufunc) and not test_gcd_overflow", "s390x" ] - - [ - "riscv64", - "riscv64-linux-gnu", - "riscv64/ubuntu:22.04", - "-Dallow-noblas=true", - "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", - "riscv64" - ] env: TOOLCHAIN_NAME: ${{ matrix.BUILD_PROP[1] }} DOCKER_CONTAINER: ${{ matrix.BUILD_PROP[2] }} @@ -170,7 +162,7 @@ jobs: '" - linux_loongarch64_qemu: + linux_loongarch64_riscv64_qemu: # Only workflow_dispatch is enabled on forks. # To enable this job and subsequent jobs on a fork for other events, comment out: if: github.repository == 'numpy/numpy' || github.event_name == 'workflow_dispatch' @@ -267,3 +259,108 @@ jobs: /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' cd /numpy && spin test -- -k \"${RUNTIME_TEST_FILTER}\" '" + + + linux_riscv64_qemu: + # To enable this workflow on a fork, comment out: + if: github.repository == 'numpy/numpy' + runs-on: ubuntu-24.04 + continue-on-error: true + strategy: + fail-fast: false + matrix: + BUILD_PROP: + - [ + "riscv64", + "riscv64-linux-gnu", + "riscv64/ubuntu:24.04", + "-Dallow-noblas=true", + "test_kind or test_multiarray or test_simd or test_umath or test_ufunc", + "riscv64" + ] + env: + TOOLCHAIN_NAME: ${{ matrix.BUILD_PROP[1] }} + DOCKER_CONTAINER: ${{ matrix.BUILD_PROP[2] }} + MESON_OPTIONS: ${{ matrix.BUILD_PROP[3] }} + RUNTIME_TEST_FILTER: ${{ matrix.BUILD_PROP[4] }} + ARCH: ${{ matrix.BUILD_PROP[5] }} + TERM: xterm-256color + + name: "${{ matrix.BUILD_PROP[0] }}" + steps: + - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 + with: + submodules: recursive + fetch-tags: true + persist-credentials: false + + - name: Initialize binfmt_misc for qemu-user-static + run: | + docker run --rm --privileged multiarch/qemu-user-static --reset -p yes + + - name: Install GCC cross-compilers + run: | + sudo apt update + sudo apt install -y ninja-build gcc-14-${TOOLCHAIN_NAME} g++-14-${TOOLCHAIN_NAME} gfortran-14-${TOOLCHAIN_NAME} + + - name: Cache docker container + uses: actions/cache@1bd1e32a3bdc45362d1e726936510720a7c30a57 # v4.2.0 + id: container-cache + with: + path: ~/docker_${{ matrix.BUILD_PROP[1] }} + key: container-${{ runner.os }}-${{ matrix.BUILD_PROP[1] }}-${{ matrix.BUILD_PROP[2] }}-${{ hashFiles('requirements/build_requirements.txt') }} + + - name: Creates new container + if: steps.container-cache.outputs.cache-hit != 'true' + run: | + docker run --platform=linux/${ARCH} --name the_container --interactive \ + -v /:/host -v $(pwd):/numpy ${DOCKER_CONTAINER} /bin/bash -c " + apt update && + apt install -y cmake git python3 python-is-python3 python3-dev python3-pip && + mkdir -p /lib64 && ln -s /host/lib64/ld-* /lib64/ && + ln -s /host/lib/x86_64-linux-gnu /lib/x86_64-linux-gnu && + rm -rf /usr/${TOOLCHAIN_NAME} && ln -s /host/usr/${TOOLCHAIN_NAME} /usr/${TOOLCHAIN_NAME} && + rm -rf /usr/lib/gcc/${TOOLCHAIN_NAME} && ln -s /host/usr/lib/gcc-cross/${TOOLCHAIN_NAME} /usr/lib/gcc/${TOOLCHAIN_NAME} && + rm -f /usr/bin/gcc && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gcc-14 /usr/bin/gcc && + rm -f /usr/bin/g++ && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-g++-14 /usr/bin/g++ && + rm -f /usr/bin/gfortran && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-gfortran-14 /usr/bin/gfortran && + rm -f /usr/bin/ar && ln 
-s /host/usr/bin/${TOOLCHAIN_NAME}-ar /usr/bin/ar && + rm -f /usr/bin/as && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-as /usr/bin/as && + rm -f /usr/bin/ld && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld /usr/bin/ld && + rm -f /usr/bin/ld.bfd && ln -s /host/usr/bin/${TOOLCHAIN_NAME}-ld.bfd /usr/bin/ld.bfd && + rm -f /usr/bin/ninja && ln -s /host/usr/bin/ninja /usr/bin/ninja && + git config --global --add safe.directory /numpy && + # No need to build ninja from source, the host ninja is used for the build + grep -v ninja /numpy/requirements/build_requirements.txt > /tmp/build_requirements.txt && + python -m pip install --break-system-packages -r /tmp/build_requirements.txt && + python -m pip install --break-system-packages pytest pytest-xdist hypothesis typing_extensions pytest-timeout && + rm -f /usr/local/bin/ninja && mkdir -p /usr/local/bin && ln -s /host/usr/bin/ninja /usr/local/bin/ninja + " + docker commit the_container the_container + mkdir -p "~/docker_${TOOLCHAIN_NAME}" + docker save -o "~/docker_${TOOLCHAIN_NAME}/the_container.tar" the_container + + - name: Load container from cache + if: steps.container-cache.outputs.cache-hit == 'true' + run: docker load -i "~/docker_${TOOLCHAIN_NAME}/the_container.tar" + + - name: Meson Build + run: | + docker run --rm --platform=linux/${ARCH} -e "TERM=xterm-256color" \ + -v $(pwd):/numpy -v /:/host the_container \ + /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' + cd /numpy && spin build --clean -- ${MESON_OPTIONS} + '" + + - name: Meson Log + if: always() + run: 'cat build/meson-logs/meson-log.txt' + + - name: Run Tests + run: | + docker run --rm --platform=linux/${ARCH} -e "TERM=xterm-256color" \ + -v $(pwd):/numpy -v /:/host the_container \ + /bin/script -e -q -c "/bin/bash --noprofile --norc -eo pipefail -c ' + export F90=/usr/bin/gfortran + cd /numpy && spin test -- --timeout=600 --durations=10 -k \"${RUNTIME_TEST_FILTER}\" + '" diff --git a/meson.options b/meson.options index 1be05d324756..4989828d22e1 100644 --- a/meson.options +++ b/meson.options @@ -35,7 +35,7 @@ option('test-simd', type: 'array', 'VSX', 'VSX2', 'VSX3', 'VSX4', 'NEON', 'ASIMD', 'VX', 'VXE', 'VXE2', - 'LSX', + 'LSX', 'RVV', ], description: 'Specify a list of CPU features to be tested against NumPy SIMD interface') option('test-simd-args', type: 'string', value: '', diff --git a/meson_cpu/meson.build b/meson_cpu/meson.build index e5b6d0fbe7be..5c5cc018a036 100644 --- a/meson_cpu/meson.build +++ b/meson_cpu/meson.build @@ -97,7 +97,7 @@ min_features = { 's390x': [], 'arm': [], 'aarch64': [ASIMD], - 'riscv64': [], + 'riscv64': [RVV], 'wasm32': [], 'loongarch64': [LSX], }.get(cpu_family, []) diff --git a/numpy/_core/meson.build b/numpy/_core/meson.build index d1c78910b2a3..229bc5cfce47 100644 --- a/numpy/_core/meson.build +++ b/numpy/_core/meson.build @@ -103,6 +103,10 @@ if host_machine.cpu_family() == 'loongarch64' add_project_arguments(['-DHWY_COMPILE_ONLY_SCALAR'], language: ['cpp']) endif +if host_machine.cpu_family() == 'riscv64' + add_project_arguments('-march=rv64gcv_zvl256b', '-mrvv-vector-bits=256', language: ['c','cpp']) +endif + use_highway = not get_option('disable-highway') if use_highway and not fs.exists('src/highway/README.md') error('Missing the `highway` git submodule! 
Run `git submodule update --init` to fix this.') @@ -750,6 +754,7 @@ _umath_tests_mtargets = mod_features.multi_targets( ASIMDHP, ASIMD, NEON, VSX3, VSX2, VSX, VXE, VX, + RVV, ], baseline: CPU_BASELINE, prefix: 'NPY_', @@ -794,7 +799,8 @@ foreach gen_mtargets : [ AVX512_SKX, AVX2, XOP, SSE42, SSE2, VSX2, ASIMD, NEON, - VXE, VX + VXE, VX, + RVV, ] ], ] @@ -897,6 +903,7 @@ foreach gen_mtargets : [ VSX3, VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -908,6 +915,7 @@ foreach gen_mtargets : [ VSX4, VSX2, VX, LSX, + RVV, ] ], [ @@ -919,6 +927,7 @@ foreach gen_mtargets : [ NEON, VXE, VX, LSX, + RVV, ] ], [ @@ -937,6 +946,7 @@ foreach gen_mtargets : [ NEON_VFPV4, VXE, LSX, + RVV, ] ], [ @@ -960,6 +970,7 @@ foreach gen_mtargets : [ VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -978,6 +989,7 @@ foreach gen_mtargets : [ NEON_VFPV4, VXE2, VXE, LSX, + RVV, ] ], [ @@ -994,6 +1006,7 @@ foreach gen_mtargets : [ VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -1005,6 +1018,7 @@ foreach gen_mtargets : [ ASIMD, NEON, VXE, VX, LSX, + RVV, ] ], [ @@ -1015,6 +1029,7 @@ foreach gen_mtargets : [ VSX2, ASIMD, NEON, LSX, + RVV, ] ], [ @@ -1026,6 +1041,7 @@ foreach gen_mtargets : [ VSX3, VSX2, VXE, VX, LSX, + RVV, ] ], [ @@ -1037,6 +1053,7 @@ foreach gen_mtargets : [ VSX2, VX, LSX, + RVV, ] ], ] diff --git a/numpy/_core/src/common/simd/intdiv.h b/numpy/_core/src/common/simd/intdiv.h index 0284d49d23bb..1e84159e2e04 100644 --- a/numpy/_core/src/common/simd/intdiv.h +++ b/numpy/_core/src/common/simd/intdiv.h @@ -220,6 +220,10 @@ NPY_FINLINE npyv_u8x3 npyv_divisor_u8(npy_uint8 d) divisor.val[0] = npyv_setall_u8(m); divisor.val[1] = npyv_setall_u8(sh1); divisor.val[2] = npyv_setall_u8(sh2); +#elif defined(NPY_HAVE_RVV) + divisor.val[0] = npyv_setall_u8(m); + divisor.val[1] = npyv_setall_u8(sh1); + divisor.val[2] = npyv_setall_u8(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -253,7 +257,7 @@ NPY_FINLINE npyv_s8x3 npyv_divisor_s8(npy_int8 d) npyv_s8x3 divisor; divisor.val[0] = npyv_setall_s8(m); divisor.val[2] = npyv_setall_s8(d < 0 ? 
-1 : 0); - #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_LSX) + #if defined(NPY_HAVE_VSX2) || defined(NPY_HAVE_VX) || defined(NPY_HAVE_LSX) || defined(NPY_HAVE_RVV) divisor.val[1] = npyv_setall_s8(sh); #elif defined(NPY_HAVE_NEON) divisor.val[1] = npyv_setall_s8(-sh); @@ -298,6 +302,9 @@ NPY_FINLINE npyv_u16x3 npyv_divisor_u16(npy_uint16 d) #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_u16(sh1); divisor.val[2] = npyv_setall_u16(sh2); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_u16(sh1); + divisor.val[2] = npyv_setall_u16(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -330,6 +337,8 @@ NPY_FINLINE npyv_s16x3 npyv_divisor_s16(npy_int16 d) divisor.val[1] = npyv_setall_s16(-sh); #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s16(sh); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_s16(sh); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -370,6 +379,9 @@ NPY_FINLINE npyv_u32x3 npyv_divisor_u32(npy_uint32 d) #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_u32(sh1); divisor.val[2] = npyv_setall_u32(sh2); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_u32(sh1); + divisor.val[2] = npyv_setall_u32(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -407,6 +419,8 @@ NPY_FINLINE npyv_s32x3 npyv_divisor_s32(npy_int32 d) divisor.val[1] = npyv_setall_s32(-sh); #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s32(sh); +#elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_s32(sh); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -444,6 +458,9 @@ NPY_FINLINE npyv_u64x3 npyv_divisor_u64(npy_uint64 d) #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_u64(sh1); divisor.val[2] = npyv_setall_u64(sh2); + #elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_u64(sh1); + divisor.val[2] = npyv_setall_u64(sh2); #else #error "please initialize the shifting operand for the new architecture" #endif @@ -484,6 +501,8 @@ NPY_FINLINE npyv_s64x3 npyv_divisor_s64(npy_int64 d) divisor.val[1] = npyv_set_s64(sh); #elif defined(NPY_HAVE_LSX) divisor.val[1] = npyv_setall_s64(sh); + #elif defined(NPY_HAVE_RVV) + divisor.val[1] = npyv_setall_s64(sh); #else #error "please initialize the shifting operand for the new architecture" #endif diff --git a/numpy/_core/src/common/simd/rvv/arithmetic.h b/numpy/_core/src/common/simd/rvv/arithmetic.h new file mode 100644 index 000000000000..ca924e18fedd --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/arithmetic.h @@ -0,0 +1,226 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_ARITHMETIC_H +#define _NPY_SIMD_RVV_ARITHMETIC_H + +/*************************** + * Addition + ***************************/ +// non-saturated +NPY_FINLINE npyv_u8 npyv_add_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vadd_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_add_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vadd_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_add_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vadd_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_add_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vadd_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_add_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vadd_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_add_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vadd_vv_i32m1(a, 
b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_add_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vadd_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_add_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vadd_vv_i64m1(a, b, npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_add_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfadd_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_add_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfadd_vv_f64m1(a, b, npyv_nlanes_f64); } + +// saturated +NPY_FINLINE npyv_u8 npyv_adds_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vsaddu_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_adds_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vsadd_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_adds_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vsaddu_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_adds_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vsadd_vv_i16m1(a, b, npyv_nlanes_s16); } + +/*************************** + * Subtraction + ***************************/ +// non-saturated +NPY_FINLINE npyv_u8 npyv_sub_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vsub_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_sub_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vsub_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_sub_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vsub_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_sub_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vsub_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_sub_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vsub_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_sub_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vsub_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_sub_u64(npyv_u64 a, npyv_u64 b) { return __riscv_vsub_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_sub_s64(npyv_s64 a, npyv_s64 b) { return __riscv_vsub_vv_i64m1(a, b, npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_sub_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfsub_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_sub_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfsub_vv_f64m1(a, b, npyv_nlanes_f64); } + +// saturated +NPY_FINLINE npyv_u8 npyv_subs_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vssubu_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_subs_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vssub_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_subs_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vssubu_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_subs_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vssub_vv_i16m1(a, b, npyv_nlanes_s16); } + +/*************************** + * Multiplication + ***************************/ +// non-saturated +NPY_FINLINE npyv_u8 npyv_mul_u8(npyv_u8 a, npyv_u8 b) { return __riscv_vmul_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_mul_s8(npyv_s8 a, npyv_s8 b) { return __riscv_vmul_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_mul_u16(npyv_u16 a, npyv_u16 b) { return __riscv_vmul_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_mul_s16(npyv_s16 a, npyv_s16 b) { return __riscv_vmul_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_mul_u32(npyv_u32 a, npyv_u32 b) { return __riscv_vmul_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_mul_s32(npyv_s32 a, npyv_s32 b) { return __riscv_vmul_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_f32 npyv_mul_f32(npyv_f32 a, npyv_f32 b) { return 
__riscv_vfmul_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_mul_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfmul_vv_f64m1(a, b, npyv_nlanes_f64); } + +/*************************** + * Integer Division + ***************************/ +// See simd/intdiv.h for more clarification +// divide each unsigned 8-bit element by a precomputed divisor +NPY_FINLINE npyv_u8 npyv_divc_u8(npyv_u8 a, const npyv_u8x3 divisor) +{ + // high part of unsigned multiplication + vuint8m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u8); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint8m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u8); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u8); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u8); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u8); + + return q; +} +// divide each signed 8-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s8 npyv_divc_s8(npyv_s8 a, const npyv_s8x3 divisor) +{ + vint8m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s8); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint8m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s8), __riscv_vreinterpret_v_i8m1_u8m1(divisor.val[1]), npyv_nlanes_s8); + q = __riscv_vsub(q, __riscv_vsra(a, 7, npyv_nlanes_s8), npyv_nlanes_s8); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s8), divisor.val[2], npyv_nlanes_s8); + return q; +} +// divide each unsigned 16-bit element by a precomputed divisor +NPY_FINLINE npyv_u16 npyv_divc_u16(npyv_u16 a, const npyv_u16x3 divisor) +{ + // high part of unsigned multiplication + vuint16m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u16); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint16m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u16); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u16); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u16); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u16); + return q; +} +// divide each signed 16-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s16 npyv_divc_s16(npyv_s16 a, const npyv_s16x3 divisor) +{ + // high part of signed multiplication + vint16m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s16); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint16m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s16), __riscv_vreinterpret_v_i16m1_u16m1(divisor.val[1]), npyv_nlanes_s16); + q = __riscv_vsub(q, __riscv_vsra(a, 15, npyv_nlanes_s16), npyv_nlanes_s16); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s16), divisor.val[2], npyv_nlanes_s16); + return q; +} +// divide each unsigned 32-bit element by a precomputed divisor +NPY_FINLINE npyv_u32 npyv_divc_u32(npyv_u32 a, const npyv_u32x3 divisor) +{ + // high part of unsigned multiplication + vuint32m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u32); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint32m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u32); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u32); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u32); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u32); + + return q; +} +// divide each signed 32-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s32 npyv_divc_s32(npyv_s32 a, const npyv_s32x3 divisor) +{ + // high part of signed multiplication + vint32m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s32); + // q = ((a + mulhi) >> sh1) - 
XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint32m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s32), __riscv_vreinterpret_v_i32m1_u32m1(divisor.val[1]), npyv_nlanes_s32); + q = __riscv_vsub(q, __riscv_vsra(a, 31, npyv_nlanes_s32), npyv_nlanes_s32); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s32), divisor.val[2], npyv_nlanes_s32); + return q; +} +// divide each unsigned 64-bit element by a precomputed divisor +NPY_FINLINE npyv_u64 npyv_divc_u64(npyv_u64 a, const npyv_u64x3 divisor) +{ + // high part of unsigned multiplication + vuint64m1_t mulhi = __riscv_vmulhu(a, divisor.val[0], npyv_nlanes_u64); + // floor(a/d) = (mulhi + ((a-mulhi) >> sh1)) >> sh2 + vuint64m1_t q = __riscv_vsub(a, mulhi, npyv_nlanes_u64); + q = __riscv_vsrl(q, divisor.val[1], npyv_nlanes_u64); + q = __riscv_vadd(mulhi, q, npyv_nlanes_u64); + q = __riscv_vsrl(q, divisor.val[2], npyv_nlanes_u64); + + return q; +} +// divide each signed 64-bit element by a precomputed divisor (round towards zero) +NPY_FINLINE npyv_s64 npyv_divc_s64(npyv_s64 a, const npyv_s64x3 divisor) +{ + // high part of signed multiplication + vint64m1_t mulhi = __riscv_vmulh(a, divisor.val[0], npyv_nlanes_s64); + // q = ((a + mulhi) >> sh1) - XSIGN(a) + // trunc(a/d) = (q ^ dsign) - dsign + vint64m1_t q = __riscv_vsra(__riscv_vadd(a, mulhi, npyv_nlanes_s64), __riscv_vreinterpret_v_i64m1_u64m1(divisor.val[1]), npyv_nlanes_s64); + q = __riscv_vsub(q, __riscv_vsra(a, 63, npyv_nlanes_s64), npyv_nlanes_s64); + q = __riscv_vsub(__riscv_vxor(q, divisor.val[2], npyv_nlanes_s64), divisor.val[2], npyv_nlanes_s64); + return q; +} + +/*************************** + * Division + ***************************/ +NPY_FINLINE npyv_f32 npyv_div_f32(npyv_f32 a, npyv_f32 b) { return __riscv_vfdiv_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_div_f64(npyv_f64 a, npyv_f64 b) { return __riscv_vfdiv_vv_f64m1(a, b, npyv_nlanes_f64); } + +/*************************** + * FUSED F32 + ***************************/ +// multiply and add, a*b + c +NPY_FINLINE npyv_f32 npyv_muladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfmadd_vv_f32m1(a, b, c, npyv_nlanes_f32); } +// multiply and subtract, a*b - c +NPY_FINLINE npyv_f32 npyv_mulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfmsub_vv_f32m1(a, b, c, npyv_nlanes_f32); } +// negate multiply and add, -(a*b) + c +NPY_FINLINE npyv_f32 npyv_nmuladd_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfnmsub_vv_f32m1(a, b, c, npyv_nlanes_f32); } +// negate multiply and subtract, -(a*b) - c +NPY_FINLINE npyv_f32 npyv_nmulsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vfnmadd_vv_f32m1(a, b, c, npyv_nlanes_f32); } + +// multiply, add for odd elements and subtract even elements. 
+// (a * b) -+ c +NPY_FINLINE npyv_f32 npyv_muladdsub_f32(npyv_f32 a, npyv_f32 b, npyv_f32 c) +{ return npyv_muladd_f32(a, b, __riscv_vfneg_v_f32m1_mu(__riscv_vreinterpret_v_u8m1_b32(__riscv_vmv_v_x_u8m1(0x55, npyv_nlanes_u8)), c, c, npyv_nlanes_f32)); } + +/*************************** + * FUSED F64 + ***************************/ +NPY_FINLINE npyv_f64 npyv_muladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfmadd_vv_f64m1(a, b, c, npyv_nlanes_f64); } +NPY_FINLINE npyv_f64 npyv_mulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfmsub_vv_f64m1(a, b, c, npyv_nlanes_f64); } +NPY_FINLINE npyv_f64 npyv_nmuladd_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfnmsub_vv_f64m1(a, b, c, npyv_nlanes_f64); } +NPY_FINLINE npyv_f64 npyv_nmulsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vfnmadd_vv_f64m1(a, b, c, npyv_nlanes_f64); } + +NPY_FINLINE npyv_f64 npyv_muladdsub_f64(npyv_f64 a, npyv_f64 b, npyv_f64 c) +{ return npyv_muladd_f64(a, b, __riscv_vfneg_v_f64m1_mu(__riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0x55, npyv_nlanes_u8)), c, c, npyv_nlanes_f64)); } + +/*************************** + * Summation + ***************************/ +// reduce sum across vector +NPY_FINLINE npy_uint32 npyv_sum_u32(npyv_u32 a) +{ return __riscv_vmv_x(__riscv_vredsum(a, __riscv_vmv_s_x_u32m1(0, 1), npyv_nlanes_u32)); } +NPY_FINLINE npy_uint64 npyv_sum_u64(npyv_u64 a) +{ return __riscv_vmv_x(__riscv_vredsum(a, __riscv_vmv_s_x_u64m1(0, 1), npyv_nlanes_u64)); } +NPY_FINLINE float npyv_sum_f32(npyv_f32 a) +{ return __riscv_vfmv_f(__riscv_vfredosum(a, __riscv_vfmv_s_f_f32m1(0, 1), npyv_nlanes_f32)); } +NPY_FINLINE double npyv_sum_f64(npyv_f64 a) +{ return __riscv_vfmv_f(__riscv_vfredosum(a, __riscv_vfmv_s_f_f64m1(0, 1), npyv_nlanes_f64)); } + +NPY_FINLINE npy_uint16 npyv_sumup_u8(npyv_u8 a) +{ return __riscv_vmv_x(__riscv_vwredsumu(a, __riscv_vmv_s_x_u16m1(0, 1), npyv_nlanes_u8)); } +NPY_FINLINE npy_uint32 npyv_sumup_u16(npyv_u16 a) +{ return __riscv_vmv_x(__riscv_vwredsumu(a, __riscv_vmv_s_x_u32m1(0, 1), npyv_nlanes_u16)); } + +#endif // _NPY_SIMD_RVV_ARITHMETIC_H diff --git a/numpy/_core/src/common/simd/rvv/conversion.h b/numpy/_core/src/common/simd/rvv/conversion.h new file mode 100644 index 000000000000..6399dca4b5d8 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/conversion.h @@ -0,0 +1,117 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_CVT_H +#define _NPY_SIMD_RVV_CVT_H + +#define npyv_cvt_u8_b8(A) A +#define npyv_cvt_u16_b16(A) A +#define npyv_cvt_u32_b32(A) A +#define npyv_cvt_u64_b64(A) A +#define npyv_cvt_s8_b8(A) __riscv_vreinterpret_v_u8m1_i8m1(npyv_cvt_u8_b8(A)) +#define npyv_cvt_s16_b16(A) __riscv_vreinterpret_v_u16m1_i16m1(npyv_cvt_u16_b16(A)) +#define npyv_cvt_s32_b32(A) __riscv_vreinterpret_v_u32m1_i32m1(npyv_cvt_u32_b32(A)) +#define npyv_cvt_s64_b64(A) __riscv_vreinterpret_v_u64m1_i64m1(npyv_cvt_u64_b64(A)) +#define npyv_cvt_f32_b32(A) __riscv_vreinterpret_v_u32m1_f32m1(npyv_cvt_u32_b32(A)) +#define npyv_cvt_f64_b64(A) __riscv_vreinterpret_v_u64m1_f64m1(npyv_cvt_u64_b64(A)) + +#define npyv_cvt_b8_u8(A) A +#define npyv_cvt_b16_u16(A) A +#define npyv_cvt_b32_u32(A) A +#define npyv_cvt_b64_u64(A) A +#define npyv_cvt_b8_s8(A) npyv_cvt_b8_u8(__riscv_vreinterpret_v_i8m1_u8m1(A)) +#define npyv_cvt_b16_s16(A) npyv_cvt_b16_u16(__riscv_vreinterpret_v_i16m1_u16m1(A)) +#define npyv_cvt_b32_s32(A) npyv_cvt_b32_u32(__riscv_vreinterpret_v_i32m1_u32m1(A)) +#define npyv_cvt_b64_s64(A) 
npyv_cvt_b64_u64(__riscv_vreinterpret_v_i64m1_u64m1(A)) +#define npyv_cvt_b32_f32(A) npyv_cvt_b32_u32(__riscv_vreinterpret_v_f32m1_u32m1(A)) +#define npyv_cvt_b64_f64(A) npyv_cvt_b64_u64(__riscv_vreinterpret_v_f64m1_u64m1(A)) + +#define npyv__from_b8(A) __riscv_vmseq_vx_u8m1_b8(A, UINT8_MAX, npyv_nlanes_u8) +#define npyv__from_b16(A) __riscv_vmseq_vx_u16m1_b16(A, UINT16_MAX, npyv_nlanes_u16) +#define npyv__from_b32(A) __riscv_vmseq_vx_u32m1_b32(A, UINT32_MAX, npyv_nlanes_u32) +#define npyv__from_b64(A) __riscv_vmseq_vx_u64m1_b64(A, UINT64_MAX, npyv_nlanes_u64) +#define npyv__to_b8(A) __riscv_vmerge_vxm_u8m1(__riscv_vmv_v_x_u8m1(0, npyv_nlanes_u8), UINT8_MAX, A, npyv_nlanes_u8) +#define npyv__to_b16(A) __riscv_vmerge_vxm_u16m1(__riscv_vmv_v_x_u16m1(0, npyv_nlanes_u16), UINT16_MAX, A, npyv_nlanes_u16) +#define npyv__to_b32(A) __riscv_vmerge_vxm_u32m1(__riscv_vmv_v_x_u32m1(0, npyv_nlanes_u32), UINT32_MAX, A, npyv_nlanes_u32) +#define npyv__to_b64(A) __riscv_vmerge_vxm_u64m1(__riscv_vmv_v_x_u64m1(0, npyv_nlanes_u64), UINT64_MAX, A, npyv_nlanes_u64) + +NPY_FINLINE npy_uint64 npyv_tobits_b8(npyv_b8 a) +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b8_u64m1(npyv__from_b8(a))) & (npyv_nlanes_u8 == 64 ? ~0 : (1ULL << npyv_nlanes_u8) - 1); } +NPY_FINLINE npy_uint64 npyv_tobits_b16(npyv_b16 a) +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b16_u64m1(npyv__from_b16(a))) & ((1ULL << npyv_nlanes_u16) - 1); } +NPY_FINLINE npy_uint64 npyv_tobits_b32(npyv_b32 a) +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b32_u64m1(npyv__from_b32(a))) & ((1ULL << npyv_nlanes_u32) - 1); } +NPY_FINLINE npy_uint64 npyv_tobits_b64(npyv_b64 a) +{ return __riscv_vmv_x(__riscv_vreinterpret_v_b64_u64m1(npyv__from_b64(a))) & ((1ULL << npyv_nlanes_u64) - 1); } + +//expand +NPY_FINLINE npyv_u16x2 npyv_expand_u16_u8(npyv_u8 data) +{ + vuint16m2_t ext = __riscv_vzext_vf2(data, npyv_nlanes_u8); + return (npyv_u16x2){{ + __riscv_vget_v_u16m2_u16m1(ext, 0), + __riscv_vget_v_u16m2_u16m1(ext, 1) + }}; +} + +NPY_FINLINE npyv_u32x2 npyv_expand_u32_u16(npyv_u16 data) +{ + vuint32m2_t ext = __riscv_vzext_vf2(data, npyv_nlanes_u16); + return (npyv_u32x2){{ + __riscv_vget_v_u32m2_u32m1(ext, 0), + __riscv_vget_v_u32m2_u32m1(ext, 1) + }}; +} + +// pack two 16-bit boolean into one 8-bit boolean vector +NPY_FINLINE npyv_b8 npyv_pack_b8_b16(npyv_b16 a, npyv_b16 b) +{ + return npyv__to_b8(__riscv_vreinterpret_v_u64m1_b8(__riscv_vmv_s_x_u64m1( + npyv_tobits_b16(b) << npyv_nlanes_u16 | + npyv_tobits_b16(a), 1 + ))); +} + +// pack four 32-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b32(npyv_b32 a, npyv_b32 b, npyv_b32 c, npyv_b32 d) +{ + return npyv__to_b8(__riscv_vreinterpret_v_u64m1_b8(__riscv_vmv_s_x_u64m1( + npyv_tobits_b32(d) << (npyv_nlanes_u32 * 3) | + npyv_tobits_b32(c) << (npyv_nlanes_u32 * 2) | + npyv_tobits_b32(b) << npyv_nlanes_u32 | + npyv_tobits_b32(a), 1 + ))); +} + + // pack eight 64-bit boolean vectors into one 8-bit boolean vector +NPY_FINLINE npyv_b8 +npyv_pack_b8_b64(npyv_b64 a, npyv_b64 b, npyv_b64 c, npyv_b64 d, + npyv_b64 e, npyv_b64 f, npyv_b64 g, npyv_b64 h) +{ + return npyv__to_b8(__riscv_vreinterpret_v_u64m1_b8(__riscv_vmv_s_x_u64m1( + npyv_tobits_b64(h) << (npyv_nlanes_u64 * 7) | + npyv_tobits_b64(g) << (npyv_nlanes_u64 * 6) | + npyv_tobits_b64(f) << (npyv_nlanes_u64 * 5) | + npyv_tobits_b64(e) << (npyv_nlanes_u64 * 4) | + npyv_tobits_b64(d) << (npyv_nlanes_u64 * 3) | + npyv_tobits_b64(c) << (npyv_nlanes_u64 * 2) | + npyv_tobits_b64(b) << npyv_nlanes_u64 | + npyv_tobits_b64(a), 1 + 
))); +} + +// round to nearest integer +NPY_FINLINE npyv_s32 npyv_round_s32_f32(npyv_f32 a) +{ + // (round-to-nearest-even) + return __riscv_vfcvt_x_f_v_i32m1(a, npyv_nlanes_s32); +} + +NPY_FINLINE npyv_s32 npyv_round_s32_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vfncvt_x_f_w_i32m1(__riscv_vcreate_v_f64m1_f64m2(a, b), npyv_nlanes_s32); +} + +#endif // _NPY_SIMD_RVV_CVT_H diff --git a/numpy/_core/src/common/simd/rvv/math.h b/numpy/_core/src/common/simd/rvv/math.h new file mode 100644 index 000000000000..de50e2c573b9 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/math.h @@ -0,0 +1,252 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_MATH_H +#define _NPY_SIMD_RVV_MATH_H + +#include +#include + +/*************************** + * Elementary + ***************************/ +NPY_FINLINE npyv_f32 npyv_abs_f32(npyv_f32 a) +{ return __riscv_vfabs_v_f32m1(a, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_abs_f64(npyv_f64 a) +{ return __riscv_vfabs_v_f64m1(a, npyv_nlanes_f64); } + +// Square +NPY_FINLINE npyv_f32 npyv_square_f32(npyv_f32 a) +{ return __riscv_vfmul_vv_f32m1(a, a, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_square_f64(npyv_f64 a) +{ return __riscv_vfmul_vv_f64m1(a, a, npyv_nlanes_f64); } + +// Square root +NPY_FINLINE npyv_f32 npyv_sqrt_f32(npyv_f32 a) +{ return __riscv_vfsqrt_v_f32m1(a, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_sqrt_f64(npyv_f64 a) +{ return __riscv_vfsqrt_v_f64m1(a, npyv_nlanes_f64); } + +// Reciprocal +NPY_FINLINE npyv_f32 npyv_recip_f32(npyv_f32 a) +{ return __riscv_vfrdiv_vf_f32m1(a, 1.0f, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_recip_f64(npyv_f64 a) +{ return __riscv_vfrdiv_vf_f64m1(a, 1.0 , npyv_nlanes_f64); } + +// Maximum +NPY_FINLINE npyv_f32 npyv_max_f32(npyv_f32 a, npyv_f32 b) +{ return __riscv_vfmax_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_max_f64(npyv_f64 a, npyv_f64 b) +{ return __riscv_vfmax_vv_f64m1(a, b, npyv_nlanes_f64); } + +// Max, NaN-suppressing +#define npyv_maxp_f32 npyv_max_f32 +#define npyv_maxp_f64 npyv_max_f64 + +// Max, NaN-propagating +NPY_FINLINE npyv_f32 npyv_maxn_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vfmax_vv_f32m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f32), npyv_nlanes_f32), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32), + npyv_nlanes_f32 + ); +} +NPY_FINLINE npyv_f64 npyv_maxn_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vfmax_vv_f64m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f64), npyv_nlanes_f64), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64), + npyv_nlanes_f64 + ); +} + +// Maximum, integer operations +NPY_FINLINE npyv_u8 npyv_max_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vmaxu_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_max_s8(npyv_s8 a, npyv_s8 b) +{ return __riscv_vmax_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_max_u16(npyv_u16 a, npyv_u16 b) +{ return __riscv_vmaxu_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_max_s16(npyv_s16 a, npyv_s16 b) +{ return __riscv_vmax_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_max_u32(npyv_u32 a, npyv_u32 b) +{ return __riscv_vmaxu_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_max_s32(npyv_s32 a, npyv_s32 b) +{ return __riscv_vmax_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_max_u64(npyv_u64 a, npyv_u64 b) +{ return __riscv_vmaxu_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_max_s64(npyv_s64 
a, npyv_s64 b) +{ return __riscv_vmax_vv_i64m1(a, b, npyv_nlanes_s64); } + +// Minimum +NPY_FINLINE npyv_f32 npyv_min_f32(npyv_f32 a, npyv_f32 b) +{ return __riscv_vfmin_vv_f32m1(a, b, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_min_f64(npyv_f64 a, npyv_f64 b) +{ return __riscv_vfmin_vv_f64m1(a, b, npyv_nlanes_f64); } + +// Min, NaN-suppressing +#define npyv_minp_f32 npyv_min_f32 +#define npyv_minp_f64 npyv_min_f64 + +// Min, NaN-propagating +NPY_FINLINE npyv_f32 npyv_minn_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vfmin_vv_f32m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f32), npyv_nlanes_f32), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32), + npyv_nlanes_f32 + ); +} +NPY_FINLINE npyv_f64 npyv_minn_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vfmin_vv_f64m1( + __riscv_vmerge(b, a, __riscv_vmfeq(b, b, npyv_nlanes_f64), npyv_nlanes_f64), + __riscv_vmerge(a, b, __riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64), + npyv_nlanes_f64 + ); +} + +// Minimum, integer operations +NPY_FINLINE npyv_u8 npyv_min_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vminu_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_min_s8(npyv_s8 a, npyv_s8 b) +{ return __riscv_vmin_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_min_u16(npyv_u16 a, npyv_u16 b) +{ return __riscv_vminu_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_min_s16(npyv_s16 a, npyv_s16 b) +{ return __riscv_vmin_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_min_u32(npyv_u32 a, npyv_u32 b) +{ return __riscv_vminu_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_min_s32(npyv_s32 a, npyv_s32 b) +{ return __riscv_vmin_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_min_u64(npyv_u64 a, npyv_u64 b) +{ return __riscv_vminu_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_min_s64(npyv_s64 a, npyv_s64 b) +{ return __riscv_vmin_vv_i64m1(a, b, npyv_nlanes_s64); } + +// reduce min/max for all data types +// Maximum reductions +NPY_FINLINE uint8_t npyv_reduce_max_u8(npyv_u8 a) +{ return __riscv_vmv_x_s_u8m1_u8(__riscv_vredmaxu_vs_u8m1_u8m1(a, __riscv_vmv_s_x_u8m1(0, 1), npyv_nlanes_u8)); } +NPY_FINLINE int8_t npyv_reduce_max_s8(npyv_s8 a) +{ return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmax_vs_i8m1_i8m1(a, __riscv_vmv_s_x_i8m1(INT8_MIN, 1), npyv_nlanes_s8)); } +NPY_FINLINE uint16_t npyv_reduce_max_u16(npyv_u16 a) +{ return __riscv_vmv_x_s_u16m1_u16(__riscv_vredmaxu_vs_u16m1_u16m1(a, __riscv_vmv_s_x_u16m1(0, 1), npyv_nlanes_u16)); } +NPY_FINLINE int16_t npyv_reduce_max_s16(npyv_s16 a) +{ return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmax_vs_i16m1_i16m1(a, __riscv_vmv_s_x_i16m1(INT16_MIN, 1), npyv_nlanes_s16)); } +NPY_FINLINE uint32_t npyv_reduce_max_u32(npyv_u32 a) +{ return __riscv_vmv_x_s_u32m1_u32(__riscv_vredmaxu_vs_u32m1_u32m1(a, __riscv_vmv_s_x_u32m1(0, 1), npyv_nlanes_u32)); } +NPY_FINLINE int32_t npyv_reduce_max_s32(npyv_s32 a) +{ return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmax_vs_i32m1_i32m1(a, __riscv_vmv_s_x_i32m1(INT32_MIN, 1), npyv_nlanes_s32)); } +NPY_FINLINE uint64_t npyv_reduce_max_u64(npyv_u64 a) +{ return __riscv_vmv_x_s_u64m1_u64(__riscv_vredmaxu_vs_u64m1_u64m1(a, __riscv_vmv_s_x_u64m1(0, 1), npyv_nlanes_u64)); } +NPY_FINLINE int64_t npyv_reduce_max_s64(npyv_s64 a) +{ return __riscv_vmv_x_s_i64m1_i64(__riscv_vredmax_vs_i64m1_i64m1(a, __riscv_vmv_s_x_i64m1(INT64_MIN, 1), npyv_nlanes_s64)); } + +// Floating-point maximum reductions +NPY_FINLINE float npyv_reduce_max_f32(npyv_f32 a) +{ return 
__riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32) != -1 ? __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmax_vs_f32m1_f32m1(a, __riscv_vfmv_s_f_f32m1(-INFINITY, 1), npyv_nlanes_f32)) : NAN; } +NPY_FINLINE double npyv_reduce_max_f64(npyv_f64 a) +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64) != -1 ? __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmax_vs_f64m1_f64m1(a, __riscv_vfmv_s_f_f64m1(-INFINITY, 1), npyv_nlanes_f64)) : NAN; } + +// NaN-suppressing maximum reductions +#define npyv_reduce_maxp_f32 npyv_reduce_max_f32 +#define npyv_reduce_maxp_f64 npyv_reduce_max_f64 + +// NaN-propagating maximum reductions +NPY_FINLINE float npyv_reduce_maxn_f32(npyv_f32 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f32), npyv_nlanes_f32) == -1 ? npyv_reduce_max_f32(a) : NAN; } +NPY_FINLINE double npyv_reduce_maxn_f64(npyv_f64 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f64), npyv_nlanes_f64) == -1 ? npyv_reduce_max_f64(a) : NAN; } + +// Minimum reductions +NPY_FINLINE uint8_t npyv_reduce_min_u8(npyv_u8 a) +{ return __riscv_vmv_x_s_u8m1_u8(__riscv_vredminu_vs_u8m1_u8m1(a, __riscv_vmv_s_x_u8m1(UINT8_MAX, 1), npyv_nlanes_u8)); } +NPY_FINLINE int8_t npyv_reduce_min_s8(npyv_s8 a) +{ return __riscv_vmv_x_s_i8m1_i8(__riscv_vredmin_vs_i8m1_i8m1(a, __riscv_vmv_s_x_i8m1(INT8_MAX, 1), npyv_nlanes_s8)); } +NPY_FINLINE uint16_t npyv_reduce_min_u16(npyv_u16 a) +{ return __riscv_vmv_x_s_u16m1_u16(__riscv_vredminu_vs_u16m1_u16m1(a, __riscv_vmv_s_x_u16m1(UINT16_MAX, 1), npyv_nlanes_u16)); } +NPY_FINLINE int16_t npyv_reduce_min_s16(npyv_s16 a) +{ return __riscv_vmv_x_s_i16m1_i16(__riscv_vredmin_vs_i16m1_i16m1(a, __riscv_vmv_s_x_i16m1(INT16_MAX, 1), npyv_nlanes_s16)); } +NPY_FINLINE uint32_t npyv_reduce_min_u32(npyv_u32 a) +{ return __riscv_vmv_x_s_u32m1_u32(__riscv_vredminu_vs_u32m1_u32m1(a, __riscv_vmv_s_x_u32m1(UINT32_MAX, 1), npyv_nlanes_u32)); } +NPY_FINLINE int32_t npyv_reduce_min_s32(npyv_s32 a) +{ return __riscv_vmv_x_s_i32m1_i32(__riscv_vredmin_vs_i32m1_i32m1(a, __riscv_vmv_s_x_i32m1(INT32_MAX, 1), npyv_nlanes_s32)); } +NPY_FINLINE uint64_t npyv_reduce_min_u64(npyv_u64 a) +{ return __riscv_vmv_x_s_u64m1_u64(__riscv_vredminu_vs_u64m1_u64m1(a, __riscv_vmv_s_x_u64m1(UINT64_MAX, 1), npyv_nlanes_u64)); } +NPY_FINLINE int64_t npyv_reduce_min_s64(npyv_s64 a) +{ return __riscv_vmv_x_s_i64m1_i64(__riscv_vredmin_vs_i64m1_i64m1(a, __riscv_vmv_s_x_i64m1(INT64_MAX, 1), npyv_nlanes_s64)); } + +// Floating-point minimum reductions +NPY_FINLINE float npyv_reduce_min_f32(npyv_f32 a) +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f32), npyv_nlanes_f32) != -1 ? __riscv_vfmv_f_s_f32m1_f32(__riscv_vfredmin_vs_f32m1_f32m1(a, __riscv_vfmv_s_f_f32m1(INFINITY, 1), npyv_nlanes_f32)) : NAN; } +NPY_FINLINE double npyv_reduce_min_f64(npyv_f64 a) +{ return __riscv_vfirst(__riscv_vmfeq(a, a, npyv_nlanes_f64), npyv_nlanes_f64) != -1 ? __riscv_vfmv_f_s_f64m1_f64(__riscv_vfredmin_vs_f64m1_f64m1(a, __riscv_vfmv_s_f_f64m1(INFINITY, 1), npyv_nlanes_f64)) : NAN; } + +// NaN-suppressing minimum reductions +#define npyv_reduce_minp_f32 npyv_reduce_min_f32 +#define npyv_reduce_minp_f64 npyv_reduce_min_f64 + +// NaN-propagating minimum reductions +NPY_FINLINE float npyv_reduce_minn_f32(npyv_f32 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f32), npyv_nlanes_f32) == -1 ? npyv_reduce_min_f32(a) : NAN; } +NPY_FINLINE double npyv_reduce_minn_f64(npyv_f64 a) +{ return __riscv_vfirst(__riscv_vmfne(a, a, npyv_nlanes_f64), npyv_nlanes_f64) == -1 ? 
npyv_reduce_min_f64(a) : NAN; } + +#define NPYV_IMPL_RVV_FCVT(TYPE, FRM) \ + NPY_FINLINE npyv_f32 npyv_##TYPE##_f32(npyv_f32 a) \ + { \ + const int vl = npyv_nlanes_f32; \ + const vfloat32m1_t b = __riscv_vmerge( \ + a, \ + __riscv_vfcvt_f(__riscv_vfcvt_x_f_v_i32m1_rm( \ + a, FRM, vl), vl \ + ), \ + __riscv_vmfle( \ + __riscv_vfabs(a, vl), 1e9, vl \ + ), vl \ + ); \ + feclearexcept(FE_INVALID); \ + return __riscv_vreinterpret_f32m1(__riscv_vor( \ + __riscv_vand( \ + __riscv_vreinterpret_u32m1(a), \ + 1 << 31, vl \ + ), \ + __riscv_vreinterpret_u32m1(b), vl \ + )); \ + } \ + NPY_FINLINE npyv_f64 npyv_##TYPE##_f64(npyv_f64 a) \ + { \ + const int vl = npyv_nlanes_f64; \ + const vfloat64m1_t b = __riscv_vmerge( \ + a, \ + __riscv_vfcvt_f(__riscv_vfcvt_x_f_v_i64m1_rm( \ + a, FRM, vl), vl \ + ), \ + __riscv_vmfle( \ + __riscv_vfabs(a, vl), 1e18, vl \ + ), vl \ + ); \ + feclearexcept(FE_INVALID); \ + return __riscv_vreinterpret_f64m1(__riscv_vor( \ + __riscv_vand( \ + __riscv_vreinterpret_u64m1(a), \ + 1ULL << 63, vl \ + ), \ + __riscv_vreinterpret_u64m1(b), vl \ + )); \ + } + +// round to nearest integer even +NPYV_IMPL_RVV_FCVT(rint, __RISCV_FRM_RNE) +// trunc +NPYV_IMPL_RVV_FCVT(trunc, __RISCV_FRM_RTZ) +// ceil +NPYV_IMPL_RVV_FCVT(ceil, __RISCV_FRM_RUP) +// floor +NPYV_IMPL_RVV_FCVT(floor, __RISCV_FRM_RDN) +#undef NPYV_IMPL_RVV_FCVT + +#endif // _NPY_SIMD_RVV_MATH_H diff --git a/numpy/_core/src/common/simd/rvv/memory.h b/numpy/_core/src/common/simd/rvv/memory.h new file mode 100644 index 000000000000..84d343d0aa01 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/memory.h @@ -0,0 +1,639 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_MEMORY_H +#define _NPY_SIMD_RVV_MEMORY_H + +#include "misc.h" + +/*************************** + * load/store + ***************************/ +// GCC requires literal type definitions for pointers types otherwise it causes ambiguous errors + +// uint8_t +NPY_FINLINE npyv_u8 npyv_load_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, npyv_nlanes_u8); } +NPY_FINLINE npyv_u8 npyv_loada_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, npyv_nlanes_u8); } +NPY_FINLINE npyv_u8 npyv_loads_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1((const uint8_t*)ptr, npyv_nlanes_u8); } +NPY_FINLINE npyv_u8 npyv_loadl_u8(const npyv_lanetype_u8 *ptr) +{ return __riscv_vle8_v_u8m1_tu(__riscv_vmv_v_x_u8m1(0, npyv_nlanes_u8), (const uint8_t*)ptr, npyv_nlanes_u8 / 2); } + +NPY_FINLINE void npyv_store_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8); } +NPY_FINLINE void npyv_storea_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8); } +NPY_FINLINE void npyv_stores_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8); } +NPY_FINLINE void npyv_storel_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, vec, npyv_nlanes_u8 / 2); } +NPY_FINLINE void npyv_storeh_u8(npyv_lanetype_u8 *ptr, npyv_u8 vec) +{ __riscv_vse8_v_u8m1((uint8_t*)ptr, __riscv_vslidedown_vx_u8m1(vec, npyv_nlanes_u8 / 2, npyv_nlanes_u8), npyv_nlanes_u8 / 2); } + +// int8_t +NPY_FINLINE npyv_s8 npyv_load_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, npyv_nlanes_s8); } +NPY_FINLINE npyv_s8 npyv_loada_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, npyv_nlanes_s8); } +NPY_FINLINE 
npyv_s8 npyv_loads_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1((const int8_t*)ptr, npyv_nlanes_s8); } +NPY_FINLINE npyv_s8 npyv_loadl_s8(const npyv_lanetype_s8 *ptr) +{ return __riscv_vle8_v_i8m1_tu(__riscv_vmv_v_x_i8m1(0, npyv_nlanes_s8), (const int8_t*)ptr, npyv_nlanes_s8 / 2); } + +NPY_FINLINE void npyv_store_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8); } +NPY_FINLINE void npyv_storea_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8); } +NPY_FINLINE void npyv_stores_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8); } +NPY_FINLINE void npyv_storel_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, vec, npyv_nlanes_s8 / 2); } +NPY_FINLINE void npyv_storeh_s8(npyv_lanetype_s8 *ptr, npyv_s8 vec) +{ __riscv_vse8_v_i8m1((int8_t*)ptr, __riscv_vslidedown_vx_i8m1(vec, npyv_nlanes_s8 / 2, npyv_nlanes_s8), npyv_nlanes_s8 / 2); } + +// uint16_t +NPY_FINLINE npyv_u16 npyv_load_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, npyv_nlanes_u16); } +NPY_FINLINE npyv_u16 npyv_loada_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, npyv_nlanes_u16); } +NPY_FINLINE npyv_u16 npyv_loads_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1((const uint16_t*)ptr, npyv_nlanes_u16); } +NPY_FINLINE npyv_u16 npyv_loadl_u16(const npyv_lanetype_u16 *ptr) +{ return __riscv_vle16_v_u16m1_tu(__riscv_vmv_v_x_u16m1(0, npyv_nlanes_u16), (const uint16_t*)ptr, npyv_nlanes_u16 / 2); } + +NPY_FINLINE void npyv_store_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16); } +NPY_FINLINE void npyv_storea_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16); } +NPY_FINLINE void npyv_stores_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16); } +NPY_FINLINE void npyv_storel_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, vec, npyv_nlanes_u16 / 2); } +NPY_FINLINE void npyv_storeh_u16(npyv_lanetype_u16 *ptr, npyv_u16 vec) +{ __riscv_vse16_v_u16m1((uint16_t*)ptr, __riscv_vslidedown_vx_u16m1(vec, npyv_nlanes_u16 / 2, npyv_nlanes_u16), npyv_nlanes_u16 / 2); } + +// int16_t +NPY_FINLINE npyv_s16 npyv_load_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, npyv_nlanes_s16); } +NPY_FINLINE npyv_s16 npyv_loada_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, npyv_nlanes_s16); } +NPY_FINLINE npyv_s16 npyv_loads_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1((const int16_t*)ptr, npyv_nlanes_s16); } +NPY_FINLINE npyv_s16 npyv_loadl_s16(const npyv_lanetype_s16 *ptr) +{ return __riscv_vle16_v_i16m1_tu(__riscv_vmv_v_x_i16m1(0, npyv_nlanes_s16), (const int16_t*)ptr, npyv_nlanes_s16 / 2); } + +NPY_FINLINE void npyv_store_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16); } +NPY_FINLINE void npyv_storea_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16); } +NPY_FINLINE void npyv_stores_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16); } +NPY_FINLINE void npyv_storel_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ 
__riscv_vse16_v_i16m1((int16_t*)ptr, vec, npyv_nlanes_s16 / 2); } +NPY_FINLINE void npyv_storeh_s16(npyv_lanetype_s16 *ptr, npyv_s16 vec) +{ __riscv_vse16_v_i16m1((int16_t*)ptr, __riscv_vslidedown_vx_i16m1(vec, npyv_nlanes_s16 / 2, npyv_nlanes_s16), npyv_nlanes_s16 / 2); } + +// uint32_t +NPY_FINLINE npyv_u32 npyv_load_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, npyv_nlanes_u32); } +NPY_FINLINE npyv_u32 npyv_loada_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, npyv_nlanes_u32); } +NPY_FINLINE npyv_u32 npyv_loads_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1((const uint32_t*)ptr, npyv_nlanes_u32); } +NPY_FINLINE npyv_u32 npyv_loadl_u32(const npyv_lanetype_u32 *ptr) +{ return __riscv_vle32_v_u32m1_tu(__riscv_vmv_v_x_u32m1(0, npyv_nlanes_u32), (const uint32_t*)ptr, npyv_nlanes_u32 / 2); } + +NPY_FINLINE void npyv_store_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32); } +NPY_FINLINE void npyv_storea_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32); } +NPY_FINLINE void npyv_stores_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32); } +NPY_FINLINE void npyv_storel_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, vec, npyv_nlanes_u32 / 2); } +NPY_FINLINE void npyv_storeh_u32(npyv_lanetype_u32 *ptr, npyv_u32 vec) +{ __riscv_vse32_v_u32m1((uint32_t*)ptr, __riscv_vslidedown_vx_u32m1(vec, npyv_nlanes_u32 / 2, npyv_nlanes_u32), npyv_nlanes_u32 / 2); } + +// int32_t +NPY_FINLINE npyv_s32 npyv_load_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, npyv_nlanes_s32); } +NPY_FINLINE npyv_s32 npyv_loada_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, npyv_nlanes_s32); } +NPY_FINLINE npyv_s32 npyv_loads_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1((const int32_t*)ptr, npyv_nlanes_s32); } +NPY_FINLINE npyv_s32 npyv_loadl_s32(const npyv_lanetype_s32 *ptr) +{ return __riscv_vle32_v_i32m1_tu(__riscv_vmv_v_x_i32m1(0, npyv_nlanes_s32), (const int32_t*)ptr, npyv_nlanes_s32 / 2); } + +NPY_FINLINE void npyv_store_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32); } +NPY_FINLINE void npyv_storea_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32); } +NPY_FINLINE void npyv_stores_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32); } +NPY_FINLINE void npyv_storel_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, vec, npyv_nlanes_s32 / 2); } +NPY_FINLINE void npyv_storeh_s32(npyv_lanetype_s32 *ptr, npyv_s32 vec) +{ __riscv_vse32_v_i32m1((int32_t*)ptr, __riscv_vslidedown_vx_i32m1(vec, npyv_nlanes_s32 / 2, npyv_nlanes_s32), npyv_nlanes_s32 / 2); } + +// uint64_t +NPY_FINLINE npyv_u64 npyv_load_u64(const npyv_lanetype_u64 *ptr) +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, npyv_nlanes_u64); } +NPY_FINLINE npyv_u64 npyv_loada_u64(const npyv_lanetype_u64 *ptr) +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, npyv_nlanes_u64); } +NPY_FINLINE npyv_u64 npyv_loads_u64(const npyv_lanetype_u64 *ptr) +{ return __riscv_vle64_v_u64m1((const uint64_t*)ptr, npyv_nlanes_u64); } +NPY_FINLINE npyv_u64 npyv_loadl_u64(const npyv_lanetype_u64 
*ptr) +{ return __riscv_vle64_v_u64m1_tu(__riscv_vmv_v_x_u64m1(0, npyv_nlanes_u64), (const uint64_t*)ptr, npyv_nlanes_u64 / 2); } + +NPY_FINLINE void npyv_store_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64); } +NPY_FINLINE void npyv_storea_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64); } +NPY_FINLINE void npyv_stores_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64); } +NPY_FINLINE void npyv_storel_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, vec, npyv_nlanes_u64 / 2); } +NPY_FINLINE void npyv_storeh_u64(npyv_lanetype_u64 *ptr, npyv_u64 vec) +{ __riscv_vse64_v_u64m1((uint64_t*)ptr, __riscv_vslidedown_vx_u64m1(vec, npyv_nlanes_u64 / 2, npyv_nlanes_u64), npyv_nlanes_u64 / 2); } + +// int64_t +NPY_FINLINE npyv_s64 npyv_load_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, npyv_nlanes_s64); } +NPY_FINLINE npyv_s64 npyv_loada_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, npyv_nlanes_s64); } +NPY_FINLINE npyv_s64 npyv_loads_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1((const int64_t*)ptr, npyv_nlanes_s64); } +NPY_FINLINE npyv_s64 npyv_loadl_s64(const npyv_lanetype_s64 *ptr) +{ return __riscv_vle64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(0, npyv_nlanes_s64), (const int64_t*)ptr, npyv_nlanes_s64 / 2); } + +NPY_FINLINE void npyv_store_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64); } +NPY_FINLINE void npyv_storea_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64); } +NPY_FINLINE void npyv_stores_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64); } +NPY_FINLINE void npyv_storel_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, vec, npyv_nlanes_s64 / 2); } +NPY_FINLINE void npyv_storeh_s64(npyv_lanetype_s64 *ptr, npyv_s64 vec) +{ __riscv_vse64_v_i64m1((int64_t*)ptr, __riscv_vslidedown_vx_i64m1(vec, npyv_nlanes_s64 / 2, npyv_nlanes_s64), npyv_nlanes_s64 / 2); } + +// float +NPY_FINLINE npyv_f32 npyv_load_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1((const float*)ptr, npyv_nlanes_f32); } +NPY_FINLINE npyv_f32 npyv_loada_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1((const float*)ptr, npyv_nlanes_f32); } +NPY_FINLINE npyv_f32 npyv_loads_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1((const float*)ptr, npyv_nlanes_f32); } +NPY_FINLINE npyv_f32 npyv_loadl_f32(const npyv_lanetype_f32 *ptr) +{ return __riscv_vle32_v_f32m1_tu(__riscv_vfmv_v_f_f32m1(0.0f, npyv_nlanes_f32), (const float*)ptr, npyv_nlanes_f32 / 2); } + +NPY_FINLINE void npyv_store_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32); } +NPY_FINLINE void npyv_storea_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32); } +NPY_FINLINE void npyv_stores_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32); } +NPY_FINLINE void npyv_storel_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, vec, npyv_nlanes_f32 / 2); } +NPY_FINLINE void npyv_storeh_f32(npyv_lanetype_f32 *ptr, npyv_f32 vec) +{ __riscv_vse32_v_f32m1((float*)ptr, 
__riscv_vslidedown_vx_f32m1(vec, npyv_nlanes_f32 / 2, npyv_nlanes_f32), npyv_nlanes_f32 / 2); } + +// double +NPY_FINLINE npyv_f64 npyv_load_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1((const double*)ptr, npyv_nlanes_f64); } +NPY_FINLINE npyv_f64 npyv_loada_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1((const double*)ptr, npyv_nlanes_f64); } +NPY_FINLINE npyv_f64 npyv_loads_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1((const double*)ptr, npyv_nlanes_f64); } +NPY_FINLINE npyv_f64 npyv_loadl_f64(const npyv_lanetype_f64 *ptr) +{ return __riscv_vle64_v_f64m1_tu(__riscv_vfmv_v_f_f64m1(0.0, npyv_nlanes_f64), (const double*)ptr, npyv_nlanes_f64 / 2); } + +NPY_FINLINE void npyv_store_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64); } +NPY_FINLINE void npyv_storea_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64); } +NPY_FINLINE void npyv_stores_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64); } +NPY_FINLINE void npyv_storel_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, vec, npyv_nlanes_f64 / 2); } +NPY_FINLINE void npyv_storeh_f64(npyv_lanetype_f64 *ptr, npyv_f64 vec) +{ __riscv_vse64_v_f64m1((double*)ptr, __riscv_vslidedown_vx_f64m1(vec, npyv_nlanes_f64 / 2, npyv_nlanes_f64), npyv_nlanes_f64 / 2); } + + +/*************************** + * Non-contiguous Load + ***************************/ +NPY_FINLINE npyv_s32 npyv_loadn_s32(const npy_int32 *ptr, npy_intp stride) +{ return __riscv_vlse32_v_i32m1((const int32_t*)ptr, stride * sizeof(int32_t), npyv_nlanes_s32); } +NPY_FINLINE npyv_u32 npyv_loadn_u32(const npy_uint32 *ptr, npy_intp stride) +{ return __riscv_vlse32_v_u32m1((const uint32_t*)ptr, stride * sizeof(uint32_t), npyv_nlanes_u32); } +NPY_FINLINE npyv_f32 npyv_loadn_f32(const float *ptr, npy_intp stride) +{ return __riscv_vlse32_v_f32m1((const float*)ptr, stride * sizeof(float), npyv_nlanes_f32); } + +NPY_FINLINE npyv_s64 npyv_loadn_s64(const npy_int64 *ptr, npy_intp stride) +{ return __riscv_vlse64_v_i64m1((const int64_t*)ptr, stride * sizeof(int64_t), npyv_nlanes_s64); } +NPY_FINLINE npyv_u64 npyv_loadn_u64(const npy_uint64 *ptr, npy_intp stride) +{ return __riscv_vlse64_v_u64m1((const uint64_t*)ptr, stride * sizeof(uint64_t), npyv_nlanes_u64); } +NPY_FINLINE npyv_f64 npyv_loadn_f64(const double *ptr, npy_intp stride) +{ return __riscv_vlse64_v_f64m1((const double*)ptr, stride * sizeof(double), npyv_nlanes_f64); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_u32 npyv_loadn2_u32(const npy_uint32 *ptr, npy_intp stride) +{ + uint32_t v[npyv_nlanes_u32]; + __riscv_vsseg2e32(v, __riscv_vlsseg2e32_v_u32mf2x2((const uint32_t*)ptr, stride * sizeof(uint32_t), npyv_nlanes_u32 / 2), npyv_nlanes_u32 / 2); + return __riscv_vle32_v_u32m1(v, npyv_nlanes_u32); +} +NPY_FINLINE npyv_s32 npyv_loadn2_s32(const npy_int32 *ptr, npy_intp stride) +{ return npyv_reinterpret_s32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } +NPY_FINLINE npyv_f32 npyv_loadn2_f32(const float *ptr, npy_intp stride) +{ return npyv_reinterpret_f32_u32(npyv_loadn2_u32((const npy_uint32*)ptr, stride)); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_u64 npyv_loadn2_u64(const npy_uint64 *ptr, npy_intp stride) +{ + uint64_t v[npyv_nlanes_u64]; + __riscv_vsseg2e64(v, __riscv_vlsseg2e64_v_u64m1x2((const uint64_t*)ptr, stride * sizeof(uint64_t), 
npyv_nlanes_u64 / 2), npyv_nlanes_u64 / 2); + return __riscv_vle64_v_u64m1(v, npyv_nlanes_u64); +} +NPY_FINLINE npyv_s64 npyv_loadn2_s64(const npy_int64 *ptr, npy_intp stride) +{ return npyv_reinterpret_s64_u64(npyv_loadn2_u64((const npy_uint64*)ptr, stride)); } + +NPY_FINLINE npyv_f64 npyv_loadn2_f64(const double *ptr, npy_intp stride) +{ return npyv_reinterpret_f64_u64(npyv_loadn2_u64((const npy_uint64*)ptr, stride)); } + +/*************************** + * Non-contiguous Store + ***************************/ +NPY_FINLINE void npyv_storen_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ __riscv_vsse32((int32_t*)ptr, stride * sizeof(int32_t), a, npyv_nlanes_s32); } +NPY_FINLINE void npyv_storen_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ __riscv_vsse32((uint32_t*)ptr, stride * sizeof(uint32_t), a, npyv_nlanes_u32); } +NPY_FINLINE void npyv_storen_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ __riscv_vsse32((float*)ptr, stride * sizeof(float), a, npyv_nlanes_f32); } + +NPY_FINLINE void npyv_storen_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ __riscv_vsse64((int64_t*)ptr, stride * sizeof(int64_t), a, npyv_nlanes_s64); } +NPY_FINLINE void npyv_storen_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ __riscv_vsse64((uint64_t*)ptr, stride * sizeof(uint64_t), a, npyv_nlanes_u64); } +NPY_FINLINE void npyv_storen_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ __riscv_vsse64((double*)ptr, stride * sizeof(double), a, npyv_nlanes_f64); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_u32(npy_uint32 *ptr, npy_intp stride, npyv_u32 a) +{ + uint32_t v[npyv_nlanes_u32]; + __riscv_vse32(v, a, npyv_nlanes_u32); + __riscv_vssseg2e32((uint32_t*)ptr, stride * sizeof(uint32_t), __riscv_vlseg2e32_v_u32mf2x2(v, npyv_nlanes_u32 / 2), npyv_nlanes_u32 / 2); +} +NPY_FINLINE void npyv_storen2_s32(npy_int32 *ptr, npy_intp stride, npyv_s32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_s32(a)); } +NPY_FINLINE void npyv_storen2_f32(float *ptr, npy_intp stride, npyv_f32 a) +{ npyv_storen2_u32((npy_uint32*)ptr, stride, npyv_reinterpret_u32_f32(a)); } + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_u64(npy_uint64 *ptr, npy_intp stride, npyv_u64 a) +{ + uint64_t v[npyv_nlanes_u64]; + __riscv_vse64(v, a, npyv_nlanes_u64); + __riscv_vssseg2e64((uint64_t*)ptr, stride * sizeof(uint64_t), __riscv_vlseg2e64_v_u64m1x2(v, npyv_nlanes_u64 / 2), npyv_nlanes_u64 / 2); +} +NPY_FINLINE void npyv_storen2_s64(npy_int64 *ptr, npy_intp stride, npyv_s64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, npyv_reinterpret_u64_s64(a)); } +NPY_FINLINE void npyv_storen2_f64(double *ptr, npy_intp stride, npyv_f64 a) +{ npyv_storen2_u64((npy_uint64*)ptr, stride, npyv_reinterpret_u64_f64(a)); } + +/********************************* + * Partial Load + *********************************/ +//// 32 +NPY_FINLINE npyv_s32 npyv_load_till_s32(const npy_int32 *ptr, npy_uintp nlane, npy_int32 fill) +{ return __riscv_vle32_v_i32m1_tu(__riscv_vmv_v_x_i32m1(fill, npyv_nlanes_s32), (const int32_t*)ptr, nlane); } +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_till_s32(ptr, nlane, 0); } + +NPY_FINLINE npyv_s64 npyv_load_till_s64(const npy_int64 *ptr, npy_uintp nlane, npy_int64 fill) +{ return __riscv_vle64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(fill, npyv_nlanes_s64), (const int64_t*)ptr, nlane); } +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_load_tillz_s64(const npy_int64 *ptr, npy_uintp 
nlane) +{ return npyv_load_till_s64(ptr, nlane, 0); } + +//// 64-bit nlane +NPY_FINLINE npyv_s32 npyv_load2_till_s32(const npy_int32 *ptr, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ + // combine the fill values as unsigned so a negative fill_lo does not sign-extend into fill_hi + const vint32m1_t fill = __riscv_vreinterpret_i32m1(__riscv_vmv_v_x_i64m1((int64_t)(((uint64_t)(uint32_t)fill_hi << 32) | (uint32_t)fill_lo), npyv_nlanes_s64)); + return __riscv_vle32_v_i32m1_tu(fill, (const int32_t*)ptr, nlane * 2); +} +// fill zero to rest lanes +NPY_FINLINE npyv_s32 npyv_load2_tillz_s32(const npy_int32 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s32(ptr, nlane * 2); } + +//// 128-bit nlane +NPY_FINLINE npyv_s64 npyv_load2_till_s64(const npy_int64 *ptr, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ + const vint64m1_t fill = __riscv_vmerge(__riscv_vmv_v_x_i64m1(fill_lo, npyv_nlanes_s64), fill_hi, __riscv_vreinterpret_v_u8m1_b64(__riscv_vmv_v_x_u8m1(0xAA, npyv_nlanes_u8)), npyv_nlanes_s64); + return __riscv_vle64_v_i64m1_tu(fill, (const int64_t*)ptr, nlane * 2); +} +NPY_FINLINE npyv_s64 npyv_load2_tillz_s64(const npy_int64 *ptr, npy_uintp nlane) +{ return npyv_load_tillz_s64(ptr, nlane * 2); } + +/********************************* + * Non-contiguous partial load + *********************************/ +NPY_FINLINE npyv_s32 npyv_loadn_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npy_int32 fill) +{ return __riscv_vlse32_v_i32m1_tu(__riscv_vmv_v_x_i32m1(fill, npyv_nlanes_s32), (const int32_t*)ptr, stride * sizeof(int32_t), nlane); } +NPY_FINLINE npyv_s32 npyv_loadn_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s32(ptr, stride, nlane, 0); } + +NPY_FINLINE npyv_s64 npyv_loadn_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npy_int64 fill) +{ return __riscv_vlse64_v_i64m1_tu(__riscv_vmv_v_x_i64m1(fill, npyv_nlanes_s64), (const int64_t*)ptr, stride * sizeof(int64_t), nlane); } +// fill zero to rest lanes +NPY_FINLINE npyv_s64 npyv_loadn_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn_till_s64(ptr, stride, nlane, 0); } + +//// 64-bit load over 32-bit stride +NPY_FINLINE npyv_s32 npyv_loadn2_till_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane, + npy_int32 fill_lo, npy_int32 fill_hi) +{ +#if npyv_nlanes_s32 == 4 + int32_t v[npyv_nlanes_s32] = { fill_lo, fill_hi, fill_lo, fill_hi }; +#elif npyv_nlanes_s32 == 8 + int32_t v[npyv_nlanes_s32] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi }; +#elif npyv_nlanes_s32 == 16 + int32_t v[npyv_nlanes_s32] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi }; +#endif + __riscv_vsseg2e32(v, __riscv_vlsseg2e32_v_i32mf2x2((const int32_t*)ptr, stride * sizeof(int32_t), nlane), nlane); + return __riscv_vle32_v_i32m1(v, npyv_nlanes_s32); +} +NPY_FINLINE npyv_s32 npyv_loadn2_tillz_s32(const npy_int32 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s32(ptr, stride, nlane, 0, 0); } + +//// 128-bit load over 64-bit stride +NPY_FINLINE npyv_s64 npyv_loadn2_till_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane, + npy_int64 fill_lo, npy_int64 fill_hi) +{ +#if npyv_nlanes_s64 == 4 + int64_t v[npyv_nlanes_s64] = { fill_lo, fill_hi, fill_lo, fill_hi }; +#elif npyv_nlanes_s64 == 8 + int64_t v[npyv_nlanes_s64] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi }; +#elif npyv_nlanes_s64 == 16 + int64_t v[npyv_nlanes_s64] = { fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, 
fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi, fill_lo, fill_hi }; +#endif + nlane = nlane < npyv_nlanes_s64 / 2 ? nlane : npyv_nlanes_s64 / 2; + __riscv_vsseg2e64(v, __riscv_vlsseg2e64_v_i64m1x2((const int64_t*)ptr, stride * sizeof(int64_t), nlane), nlane); + return __riscv_vle64_v_i64m1(v, npyv_nlanes_s64); +} +NPY_FINLINE npyv_s64 npyv_loadn2_tillz_s64(const npy_int64 *ptr, npy_intp stride, npy_uintp nlane) +{ return npyv_loadn2_till_s64(ptr, stride, nlane, 0, 0); } + +/********************************* + * Partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_store_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ __riscv_vse32((int32_t*)ptr, a, nlane); } + +//// 64 +NPY_FINLINE void npyv_store_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ __riscv_vse64((int64_t*)ptr, a, nlane); } + +//// 64-bit nlane +NPY_FINLINE void npyv_store2_till_s32(npy_int32 *ptr, npy_uintp nlane, npyv_s32 a) +{ npyv_store_till_s32(ptr, nlane * 2, a); } + +//// 128-bit nlane +NPY_FINLINE void npyv_store2_till_s64(npy_int64 *ptr, npy_uintp nlane, npyv_s64 a) +{ npyv_store_till_s64(ptr, nlane * 2, a); } + +/********************************* + * Non-contiguous partial store + *********************************/ +//// 32 +NPY_FINLINE void npyv_storen_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ __riscv_vsse32((int32_t*)ptr, stride * sizeof(int32_t), a, nlane); } + +//// 64 +NPY_FINLINE void npyv_storen_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ __riscv_vsse64((int64_t*)ptr, stride * sizeof(int64_t), a, nlane); } + +//// 64-bit store over 32-bit stride +NPY_FINLINE void npyv_storen2_till_s32(npy_int32 *ptr, npy_intp stride, npy_uintp nlane, npyv_s32 a) +{ + int32_t v[npyv_nlanes_s32]; + __riscv_vse32(v, a, npyv_nlanes_s32); + __riscv_vssseg2e32((int32_t*)ptr, stride * sizeof(int32_t), __riscv_vlseg2e32_v_i32mf2x2(v, nlane), nlane); +} + +//// 128-bit store over 64-bit stride +NPY_FINLINE void npyv_storen2_till_s64(npy_int64 *ptr, npy_intp stride, npy_uintp nlane, npyv_s64 a) +{ + int64_t v[npyv_nlanes_s64]; + nlane = nlane < npyv_nlanes_s64 / 2 ? nlane : npyv_nlanes_s64 / 2; + __riscv_vse64(v, a, npyv_nlanes_s64); + __riscv_vssseg2e64((int64_t*)ptr, stride * sizeof(int64_t), __riscv_vlseg2e64_v_i64m1x2(v, nlane), nlane); +} + +/***************************************************************** + * Implement partial load/store for u32/f32/u64/f64... 
via casting + *****************************************************************/ +#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun; \ + pun.from_##F_SFX = fill; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill) \ + { \ + union { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + } pun; \ + pun.from_##F_SFX = fill; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_load_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(u32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(f32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(u64, s64) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES(f64, s64) +#undef NPYV_IMPL_RVV_REST_PARTIAL_TYPES + +// 128-bit/64-bit stride +#define NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(F_SFX, T_SFX) \ + NPY_FINLINE npyv_##F_SFX npyv_load2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane, pun_lo.to_##T_SFX, pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_till_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, \ + npyv_lanetype_##F_SFX fill_lo, npyv_lanetype_##F_SFX fill_hi) \ + { \ + union pun { \ + npyv_lanetype_##F_SFX from_##F_SFX; \ + npyv_lanetype_##T_SFX to_##T_SFX; \ + }; \ + union pun pun_lo; \ + union pun pun_hi; \ + pun_lo.from_##F_SFX = fill_lo; \ + pun_hi.from_##F_SFX = fill_hi; \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_till_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane, pun_lo.to_##T_SFX, \ + pun_hi.to_##T_SFX \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX 
npyv_load2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_load2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, nlane \ + )); \ + } \ + NPY_FINLINE npyv_##F_SFX npyv_loadn2_tillz_##F_SFX \ + (const npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane) \ + { \ + return npyv_reinterpret_##F_SFX##_##T_SFX(npyv_loadn2_tillz_##T_SFX( \ + (const npyv_lanetype_##T_SFX *)ptr, stride, nlane \ + )); \ + } \ + NPY_FINLINE void npyv_store2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_store2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } \ + NPY_FINLINE void npyv_storen2_till_##F_SFX \ + (npyv_lanetype_##F_SFX *ptr, npy_intp stride, npy_uintp nlane, npyv_##F_SFX a) \ + { \ + npyv_storen2_till_##T_SFX( \ + (npyv_lanetype_##T_SFX *)ptr, stride, nlane, \ + npyv_reinterpret_##T_SFX##_##F_SFX(a) \ + ); \ + } + +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(u32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(f32, s32) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(u64, s64) +NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR(f64, s64) +#undef NPYV_IMPL_RVV_REST_PARTIAL_TYPES_PAIR + +/************************************************************ + * de-interleave load / interleave contiguous store + ************************************************************/ +// two channels +#define NPYV_IMPL_RVV_MEM_INTERLEAVE(SFX, R_SFX, EEW) \ + NPY_FINLINE npyv_##SFX##x2 npyv_load_##SFX##x2( \ + const npyv_lanetype_##SFX *ptr \ + ) { \ + npyv__##SFX##x2 v = __riscv_vlseg2##EEW##_v_##R_SFX##m1x2( \ + ptr, npyv_nlanes_##SFX \ + ); \ + return (npyv_##SFX##x2){{ \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(v, 0), \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(v, 1) \ + }}; \ + } \ + NPY_FINLINE void npyv_store_##SFX##x2( \ + npyv_lanetype_##SFX *ptr, npyv_##SFX##x2 v \ + ) { \ + __riscv_vsseg2##EEW( \ + ptr, \ + __riscv_vcreate_v_##R_SFX##m1x2(v.val[0], v.val[1]), \ + npyv_nlanes_##SFX \ + ); \ + } + +NPYV_IMPL_RVV_MEM_INTERLEAVE(u8, u8, e8) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s8, i8, e8) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u16, u16, e16) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s16, i16, e16) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u32, u32, e32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s32, i32, e32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(u64, u64, e64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(s64, i64, e64) +NPYV_IMPL_RVV_MEM_INTERLEAVE(f32, f32, e32) +NPYV_IMPL_RVV_MEM_INTERLEAVE(f64, f64, e64) +#undef NPYV_IMPL_RVV_MEM_INTERLEAVE + +/********************************* + * Lookup table + *********************************/ +// uses vector as indexes into a table +// that contains 32 elements of uint32. +NPY_FINLINE npyv_u32 npyv_lut32_u32(const npy_uint32 *table, npyv_u32 idx) +{ return __riscv_vloxei32_v_u32m1((const uint32_t*)table, __riscv_vmul(idx, sizeof(uint32_t), npyv_nlanes_u32), npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_lut32_s32(const npy_int32 *table, npyv_u32 idx) +{ return npyv_reinterpret_s32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } +NPY_FINLINE npyv_f32 npyv_lut32_f32(const float *table, npyv_u32 idx) +{ return npyv_reinterpret_f32_u32(npyv_lut32_u32((const npy_uint32*)table, idx)); } + +// uses vector as indexes into a table +// that contains 16 elements of uint64. 
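+// note: the RVV indexed-load intrinsics (vloxei) used for these lookups take byte
+// offsets, so the lane indices are scaled by the element size before the gather.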
+NPY_FINLINE npyv_u64 npyv_lut16_u64(const npy_uint64 *table, npyv_u64 idx) +{ return __riscv_vloxei64_v_u64m1((const uint64_t*)table, __riscv_vmul(idx, sizeof(uint64_t), npyv_nlanes_u64), npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_lut16_s64(const npy_int64 *table, npyv_u64 idx) +{ return npyv_reinterpret_s64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } +NPY_FINLINE npyv_f64 npyv_lut16_f64(const double *table, npyv_u64 idx) +{ return npyv_reinterpret_f64_u64(npyv_lut16_u64((const npy_uint64*)table, idx)); } + +#endif // _NPY_SIMD_RVV_MEMORY_H diff --git a/numpy/_core/src/common/simd/rvv/misc.h b/numpy/_core/src/common/simd/rvv/misc.h new file mode 100644 index 000000000000..90b913df5c84 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/misc.h @@ -0,0 +1,369 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_MISC_H +#define _NPY_SIMD_RVV_MISC_H + +#include "conversion.h" + +// vector with zero lanes +#define npyv_zero_u8() __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) +#define npyv_zero_s8() __riscv_vreinterpret_v_i32m1_i8m1(npyv_zero_s32()) +#define npyv_zero_u16() __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) +#define npyv_zero_s16() __riscv_vreinterpret_v_i32m1_i16m1(npyv_zero_s32()) +#define npyv_zero_u32() __riscv_vmv_v_x_u32m1((uint32_t)0, npyv_nlanes_u32) +#define npyv_zero_s32() __riscv_vmv_v_x_i32m1((int32_t)0, npyv_nlanes_s32) +#define npyv_zero_u64() __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(npyv_zero_s32())) +#define npyv_zero_s64() __riscv_vreinterpret_v_i32m1_i64m1(npyv_zero_s32()) +#define npyv_zero_f32() __riscv_vfmv_v_f_f32m1(0.0f, npyv_nlanes_f32) +#define npyv_zero_f64() __riscv_vfmv_v_f_f64m1(0.0, npyv_nlanes_f64) + +// vector with a specific value set to all lanes +NPY_FINLINE npyv_u8 npyv_setall_u8(uint8_t val) +{ return __riscv_vmv_v_x_u8m1(val, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_setall_s8(int8_t val) +{ return __riscv_vmv_v_x_i8m1(val, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_setall_u16(uint16_t val) +{ return __riscv_vmv_v_x_u16m1(val, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_setall_s16(int16_t val) +{ return __riscv_vmv_v_x_i16m1(val, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_setall_u32(uint32_t val) +{ return __riscv_vmv_v_x_u32m1(val, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_setall_s32(int32_t val) +{ return __riscv_vmv_v_x_i32m1(val, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_setall_u64(uint64_t val) +{ return __riscv_vmv_v_x_u64m1(val, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_setall_s64(int64_t val) +{ return __riscv_vmv_v_x_i64m1(val, npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_setall_f32(float val) +{ return __riscv_vfmv_v_f_f32m1(val, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_setall_f64(double val) +{ return __riscv_vfmv_v_f_f64m1(val, npyv_nlanes_f64); } + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv__set_u8(...) \ + ({ \ + const uint8_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u8] = { __VA_ARGS__ }; \ + __riscv_vle8_v_u8m1(v, npyv_nlanes_u8); \ + }) +#define npyv__set_s8(...) \ + ({ \ + const int8_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s8] = { __VA_ARGS__ }; \ + __riscv_vle8_v_i8m1(v, npyv_nlanes_s8); \ + }) +#define npyv__set_u16(...) 
\ + ({ \ + const uint16_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u16] = { __VA_ARGS__ }; \ + __riscv_vle16_v_u16m1(v, npyv_nlanes_u16); \ + }) +#define npyv__set_s16(...) \ + ({ \ + const int16_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s16] = { __VA_ARGS__ }; \ + __riscv_vle16_v_i16m1(v, npyv_nlanes_s16); \ + }) +#define npyv__set_u32(...) \ + ({ \ + const uint32_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u32] = { __VA_ARGS__ }; \ + __riscv_vle32_v_u32m1(v, npyv_nlanes_u32); \ + }) +#define npyv__set_s32(...) \ + ({ \ + const int32_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s32] = { __VA_ARGS__ }; \ + __riscv_vle32_v_i32m1(v, npyv_nlanes_s32); \ + }) +#define npyv__set_u64(...) \ + ({ \ + const uint64_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_u64] = { __VA_ARGS__ }; \ + __riscv_vle64_v_u64m1(v, npyv_nlanes_u64); \ + }) +#define npyv__set_s64(...) \ + ({ \ + const int64_t NPY_DECL_ALIGNED(16) v[npyv_nlanes_s64] = { __VA_ARGS__ }; \ + __riscv_vle64_v_i64m1(v, npyv_nlanes_s64); \ + }) +#define npyv__set_f32(...) \ + ({ \ + const float NPY_DECL_ALIGNED(16) v[npyv_nlanes_f32] = { __VA_ARGS__ }; \ + __riscv_vle32_v_f32m1(v, npyv_nlanes_f32); \ + }) +#define npyv__set_f64(...) \ + ({ \ + const double NPY_DECL_ALIGNED(16) v[npyv_nlanes_f64] = { __VA_ARGS__ }; \ + __riscv_vle64_v_f64m1(v, npyv_nlanes_f64); \ + }) + +#define npyv_setf_u8(FILL, ...) npyv__set_u8(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u8)(npy_uint8, FILL, __VA_ARGS__)) +#define npyv_setf_s8(FILL, ...) npyv__set_s8(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s8)(npy_int8, FILL, __VA_ARGS__)) +#define npyv_setf_u16(FILL, ...) npyv__set_u16(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u16)(npy_uint16, FILL, __VA_ARGS__)) +#define npyv_setf_s16(FILL, ...) npyv__set_s16(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s16)(npy_int16, FILL, __VA_ARGS__)) +#define npyv_setf_u32(FILL, ...) npyv__set_u32(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u32)(npy_uint32, FILL, __VA_ARGS__)) +#define npyv_setf_s32(FILL, ...) npyv__set_s32(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s32)(npy_int32, FILL, __VA_ARGS__)) +#define npyv_setf_u64(FILL, ...) npyv__set_u64(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_u64)(npy_uint64, FILL, __VA_ARGS__)) +#define npyv_setf_s64(FILL, ...) npyv__set_s64(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_s64)(npy_int64, FILL, __VA_ARGS__)) +#define npyv_setf_f32(FILL, ...) npyv__set_f32(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_f32)(float, FILL, __VA_ARGS__)) +#define npyv_setf_f64(FILL, ...) npyv__set_f64(NPY_CAT(NPYV__SET_FILL_, npyv_nlanes_f64)(double, FILL, __VA_ARGS__)) + +// vector with specific values set to each lane and +// set zero to all remained lanes +#define npyv_set_u8(...) npyv_setf_u8(0, __VA_ARGS__) +#define npyv_set_s8(...) npyv_setf_s8(0, __VA_ARGS__) +#define npyv_set_u16(...) npyv_setf_u16(0, __VA_ARGS__) +#define npyv_set_s16(...) npyv_setf_s16(0, __VA_ARGS__) +#define npyv_set_u32(...) npyv_setf_u32(0, __VA_ARGS__) +#define npyv_set_s32(...) npyv_setf_s32(0, __VA_ARGS__) +#define npyv_set_u64(...) npyv_setf_u64(0, __VA_ARGS__) +#define npyv_set_s64(...) npyv_setf_s64(0, __VA_ARGS__) +#define npyv_set_f32(...) npyv_setf_f32(0, __VA_ARGS__) +#define npyv_set_f64(...) 
npyv_setf_f64(0, __VA_ARGS__) + +// Per lane select +NPY_FINLINE npyv_u8 npyv_select_u8(npyv_b8 a, npyv_u8 b, npyv_u8 c) +{ return __riscv_vmerge_vvm_u8m1(c, b, npyv__from_b8(a), npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_select_s8(npyv_b8 a, npyv_s8 b, npyv_s8 c) +{ return __riscv_vmerge_vvm_i8m1(c, b, npyv__from_b8(a), npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_select_u16(npyv_b16 a, npyv_u16 b, npyv_u16 c) +{ return __riscv_vmerge_vvm_u16m1(c, b, npyv__from_b16(a), npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_select_s16(npyv_b16 a, npyv_s16 b, npyv_s16 c) +{ return __riscv_vmerge_vvm_i16m1(c, b, npyv__from_b16(a), npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_select_u32(npyv_b32 a, npyv_u32 b, npyv_u32 c) +{ return __riscv_vmerge_vvm_u32m1(c, b, npyv__from_b32(a), npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_select_s32(npyv_b32 a, npyv_s32 b, npyv_s32 c) +{ return __riscv_vmerge_vvm_i32m1(c, b, npyv__from_b32(a), npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_select_u64(npyv_b64 a, npyv_u64 b, npyv_u64 c) +{ return __riscv_vmerge_vvm_u64m1(c, b, npyv__from_b64(a), npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_select_s64(npyv_b64 a, npyv_s64 b, npyv_s64 c) +{ return __riscv_vmerge_vvm_i64m1(c, b, npyv__from_b64(a), npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_select_f32(npyv_b32 a, npyv_f32 b, npyv_f32 c) +{ return __riscv_vmerge_vvm_f32m1(c, b, npyv__from_b32(a), npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_select_f64(npyv_b64 a, npyv_f64 b, npyv_f64 c) +{ return __riscv_vmerge_vvm_f64m1(c, b, npyv__from_b64(a), npyv_nlanes_f64); } + +// extract the first vector's lane +NPY_FINLINE npy_uint8 npyv_extract0_u8(npyv_u8 a) +{ return __riscv_vmv_x_s_u8m1_u8(a); } +NPY_FINLINE npy_int8 npyv_extract0_s8(npyv_s8 a) +{ return __riscv_vmv_x_s_i8m1_i8(a); } +NPY_FINLINE npy_uint16 npyv_extract0_u16(npyv_u16 a) +{ return __riscv_vmv_x_s_u16m1_u16(a); } +NPY_FINLINE npy_int16 npyv_extract0_s16(npyv_s16 a) +{ return __riscv_vmv_x_s_i16m1_i16(a); } +NPY_FINLINE npy_uint32 npyv_extract0_u32(npyv_u32 a) +{ return __riscv_vmv_x_s_u32m1_u32(a); } +NPY_FINLINE npy_int32 npyv_extract0_s32(npyv_s32 a) +{ return __riscv_vmv_x_s_i32m1_i32(a); } +NPY_FINLINE npy_uint64 npyv_extract0_u64(npyv_u64 a) +{ return __riscv_vmv_x_s_u64m1_u64(a); } +NPY_FINLINE npy_int64 npyv_extract0_s64(npyv_s64 a) +{ return __riscv_vmv_x_s_i64m1_i64(a); } +NPY_FINLINE float npyv_extract0_f32(npyv_f32 a) +{ return __riscv_vfmv_f_s_f32m1_f32(a); } +NPY_FINLINE double npyv_extract0_f64(npyv_f64 a) +{ return __riscv_vfmv_f_s_f64m1_f64(a); } + +// Reinterpret +#define npyv_reinterpret_u8_u8(X) X +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_i8m1_u8m1(a); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_u16m1_u8m1(a); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_u16m1_u8m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_u32m1_u8m1(a); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_u64m1_u8m1(a); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_f32(npyv_f32 a) +{ return 
__riscv_vreinterpret_v_u32m1_u8m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); } +NPY_FINLINE npyv_u8 npyv_reinterpret_u8_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_u64m1_u8m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); } + +#define npyv_reinterpret_s8_s8(X) X +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_u8m1_i8m1(a); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_i16m1_i8m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_i16m1_i8m1(a); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_i32m1_i8m1(a); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_i64m1_i8m1(a); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_i32m1_i8m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); } +NPY_FINLINE npyv_s8 npyv_reinterpret_s8_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_i64m1_i8m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); } + +#define npyv_reinterpret_u16_u16(X) X +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_u8m1_u16m1(a); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_u8m1_u16m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_i16m1_u16m1(a); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_u32m1_u16m1(a); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_u64m1_u16m1(a); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_u32m1_u16m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); } +NPY_FINLINE npyv_u16 npyv_reinterpret_u16_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_u64m1_u16m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); } + +#define npyv_reinterpret_s16_s16(X) X +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_i8m1_i16m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_i8m1_i16m1(a); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_u16m1_i16m1(a); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_i32m1_i16m1(a); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_i64m1_i16m1(a); } +NPY_FINLINE npyv_s16 
npyv_reinterpret_s16_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_i32m1_i16m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); } +NPY_FINLINE npyv_s16 npyv_reinterpret_s16_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_i64m1_i16m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); } + +#define npyv_reinterpret_u32_u32(X) X +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_u8m1_u32m1(a); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_u8m1_u32m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_u16m1_u32m1(a); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_u16m1_u32m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_i32m1_u32m1(a); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_u64m1_u32m1(a); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_i64m1_u64m1(a)); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_f32m1_u32m1(a); } +NPY_FINLINE npyv_u32 npyv_reinterpret_u32_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a)); } + +#define npyv_reinterpret_s32_s32(X) X +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_i8m1_i32m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_i8m1_i32m1(a); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_i16m1_i32m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_i16m1_i32m1(a); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_u32m1_i32m1(a); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_u64m1_i64m1(a)); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_i64m1_i32m1(a); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_f32m1_i32m1(a); } +NPY_FINLINE npyv_s32 npyv_reinterpret_s32_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_i64m1_i32m1(__riscv_vreinterpret_v_f64m1_i64m1(a)); } + +#define npyv_reinterpret_u64_u64(X) X +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_u8m1_u64m1(a); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_u8m1_u64m1(__riscv_vreinterpret_v_i8m1_u8m1(a)); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_u16m1_u64m1(a); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_u16m1_u64m1(__riscv_vreinterpret_v_i16m1_u16m1(a)); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_u32m1_u64m1(a); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_i32m1_u32m1(a)); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_i64m1_u64m1(a); } +NPY_FINLINE npyv_u64 
npyv_reinterpret_u64_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a)); } +NPY_FINLINE npyv_u64 npyv_reinterpret_u64_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_f64m1_u64m1(a); } + +#define npyv_reinterpret_s64_s64(X) X +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_i8m1_i64m1(__riscv_vreinterpret_v_u8m1_i8m1(a)); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_i8m1_i64m1(a); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_i16m1_i64m1(__riscv_vreinterpret_v_u16m1_i16m1(a)); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_i16m1_i64m1(a); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_u32m1_i32m1(a)); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_i32m1_i64m1(a); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_u64m1_i64m1(a); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_i32m1_i64m1(__riscv_vreinterpret_v_f32m1_i32m1(a)); } +NPY_FINLINE npyv_s64 npyv_reinterpret_s64_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_f64m1_i64m1(a); } + +#define npyv_reinterpret_f32_f32(X) X +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u8m1_u32m1(a)); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i8m1_i32m1(a)); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u16m1_u32m1(a)); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i16m1_i32m1(a)); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_u32m1_f32m1(a); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s32(npyv_s32 a) +{ return __riscv_vreinterpret_v_i32m1_f32m1(a); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(a)); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_i32m1_f32m1(__riscv_vreinterpret_v_i64m1_i32m1(a)); } +NPY_FINLINE npyv_f32 npyv_reinterpret_f32_f64(npyv_f64 a) +{ return __riscv_vreinterpret_v_u32m1_f32m1(__riscv_vreinterpret_v_u64m1_u32m1(__riscv_vreinterpret_v_f64m1_u64m1(a))); } + +#define npyv_reinterpret_f64_f64(X) X +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u8(npyv_u8 a) +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u8m1_u64m1(a)); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i8m1_i64m1(a)); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u16(npyv_u16 a) +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u16m1_u64m1(a)); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i16m1_i64m1(a)); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u32(npyv_u32 a) +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(a)); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s32(npyv_s32 a) +{ return 
__riscv_vreinterpret_v_i64m1_f64m1(__riscv_vreinterpret_v_i32m1_i64m1(a)); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_u64(npyv_u64 a) +{ return __riscv_vreinterpret_v_u64m1_f64m1(a); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_s64(npyv_s64 a) +{ return __riscv_vreinterpret_v_i64m1_f64m1(a); } +NPY_FINLINE npyv_f64 npyv_reinterpret_f64_f32(npyv_f32 a) +{ return __riscv_vreinterpret_v_u64m1_f64m1(__riscv_vreinterpret_v_u32m1_u64m1(__riscv_vreinterpret_v_f32m1_u32m1(a))); } + +// Only required by AVX2/AVX512 +#define npyv_cleanup() ((void)0) + +#endif // _NPY_SIMD_RVV_MISC_H diff --git a/numpy/_core/src/common/simd/rvv/operators.h b/numpy/_core/src/common/simd/rvv/operators.h new file mode 100644 index 000000000000..03b3e516b5fa --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/operators.h @@ -0,0 +1,430 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_OPERATORS_H +#define _NPY_SIMD_RVV_OPERATORS_H + +/*************************** + * Shifting + ***************************/ +// left +NPY_FINLINE npyv_u16 npyv_shl_u16(npyv_u16 a, int16_t c) +{ return __riscv_vsll_vx_u16m1(a, c, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_shl_s16(npyv_s16 a, int16_t c) +{ return __riscv_vsll_vx_i16m1(a, c, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_shl_u32(npyv_u32 a, int32_t c) +{ return __riscv_vsll_vx_u32m1(a, c, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_shl_s32(npyv_s32 a, int32_t c) +{ return __riscv_vsll_vx_i32m1(a, c, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_shl_u64(npyv_u64 a, int64_t c) +{ return __riscv_vsll_vx_u64m1(a, c, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_shl_s64(npyv_s64 a, int64_t c) +{ return __riscv_vsll_vx_i64m1(a, c, npyv_nlanes_s64); } + +// left by an immediate constant +NPY_FINLINE npyv_u16 npyv_shli_u16(npyv_u16 a, const int b) +{ return __riscv_vsll_vx_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_shli_s16(npyv_s16 a, const int b) +{ return __riscv_vsll_vx_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_shli_u32(npyv_u32 a, const int b) +{ return __riscv_vsll_vx_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_shli_s32(npyv_s32 a, const int b) +{ return __riscv_vsll_vx_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_shli_u64(npyv_u64 a, const int b) +{ return __riscv_vsll_vx_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_shli_s64(npyv_s64 a, const int b) +{ return __riscv_vsll_vx_i64m1(a, b, npyv_nlanes_s64); } + +// right +NPY_FINLINE npyv_u16 npyv_shr_u16(npyv_u16 a, int16_t c) +{ return __riscv_vsrl_vx_u16m1(a, c, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_shr_s16(npyv_s16 a, int16_t c) +{ return __riscv_vsra_vx_i16m1(a, c, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_shr_u32(npyv_u32 a, int32_t c) +{ return __riscv_vsrl_vx_u32m1(a, c, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_shr_s32(npyv_s32 a, int32_t c) +{ return __riscv_vsra_vx_i32m1(a, c, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_shr_u64(npyv_u64 a, int64_t c) +{ return __riscv_vsrl_vx_u64m1(a, c, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_shr_s64(npyv_s64 a, int64_t c) +{ return __riscv_vsra_vx_i64m1(a, c, npyv_nlanes_s64); } + +// right by an immediate constant +NPY_FINLINE npyv_u16 npyv_shri_u16(npyv_u16 a, const int b) +{ return __riscv_vsrl_vx_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_shri_s16(npyv_s16 a, const int b) +{ return __riscv_vsra_vx_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_shri_u32(npyv_u32 a, const int b) +{ return 
__riscv_vsrl_vx_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_shri_s32(npyv_s32 a, const int b) +{ return __riscv_vsra_vx_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_shri_u64(npyv_u64 a, const int b) +{ return __riscv_vsrl_vx_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_shri_s64(npyv_s64 a, const int b) +{ return __riscv_vsra_vx_i64m1(a, b, npyv_nlanes_s64); } + +/*************************** + * Logical + ***************************/ +// AND +NPY_FINLINE npyv_u8 npyv_and_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vand_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_and_s8(npyv_s8 a, npyv_s8 b) +{ return __riscv_vand_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_and_u16(npyv_u16 a, npyv_u16 b) +{ return __riscv_vand_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_and_s16(npyv_s16 a, npyv_s16 b) +{ return __riscv_vand_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_and_u32(npyv_u32 a, npyv_u32 b) +{ return __riscv_vand_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_and_s32(npyv_s32 a, npyv_s32 b) +{ return __riscv_vand_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_and_u64(npyv_u64 a, npyv_u64 b) +{ return __riscv_vand_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_and_s64(npyv_s64 a, npyv_s64 b) +{ return __riscv_vand_vv_i64m1(a, b, npyv_nlanes_s64); } + +NPY_FINLINE npyv_f32 npyv_and_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vand_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vreinterpret_v_f32m1_u32m1(b), + npyv_nlanes_f32 + ) + ); +} +NPY_FINLINE npyv_f64 npyv_and_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vand_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vreinterpret_v_f64m1_u64m1(b), + npyv_nlanes_f64 + ) + ); +} + +#define npyv_and_b8 npyv_and_u8 +#define npyv_and_b16 npyv_and_u16 +#define npyv_and_b32 npyv_and_u32 +#define npyv_and_b64 npyv_and_u64 + +// OR +NPY_FINLINE npyv_u8 npyv_or_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vor_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_or_s8(npyv_s8 a, npyv_s8 b) +{ return __riscv_vor_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_or_u16(npyv_u16 a, npyv_u16 b) +{ return __riscv_vor_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_or_s16(npyv_s16 a, npyv_s16 b) +{ return __riscv_vor_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_or_u32(npyv_u32 a, npyv_u32 b) +{ return __riscv_vor_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_or_s32(npyv_s32 a, npyv_s32 b) +{ return __riscv_vor_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_or_u64(npyv_u64 a, npyv_u64 b) +{ return __riscv_vor_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_or_s64(npyv_s64 a, npyv_s64 b) +{ return __riscv_vor_vv_i64m1(a, b, npyv_nlanes_s64); } + +NPY_FINLINE npyv_f32 npyv_or_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vreinterpret_v_f32m1_u32m1(b), + npyv_nlanes_f32 + ) + ); +} +NPY_FINLINE npyv_f64 npyv_or_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vreinterpret_v_f64m1_u64m1(b), + npyv_nlanes_f64 + ) + ); +} + +#define npyv_or_b8 npyv_or_u8 +#define npyv_or_b16 npyv_or_u16 +#define npyv_or_b32 npyv_or_u32 +#define 
npyv_or_b64 npyv_or_u64 + +// XOR +NPY_FINLINE npyv_u8 npyv_xor_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vxor_vv_u8m1(a, b, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_xor_s8(npyv_s8 a, npyv_s8 b) +{ return __riscv_vxor_vv_i8m1(a, b, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_xor_u16(npyv_u16 a, npyv_u16 b) +{ return __riscv_vxor_vv_u16m1(a, b, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_xor_s16(npyv_s16 a, npyv_s16 b) +{ return __riscv_vxor_vv_i16m1(a, b, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_xor_u32(npyv_u32 a, npyv_u32 b) +{ return __riscv_vxor_vv_u32m1(a, b, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_xor_s32(npyv_s32 a, npyv_s32 b) +{ return __riscv_vxor_vv_i32m1(a, b, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_xor_u64(npyv_u64 a, npyv_u64 b) +{ return __riscv_vxor_vv_u64m1(a, b, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_xor_s64(npyv_s64 a, npyv_s64 b) +{ return __riscv_vxor_vv_i64m1(a, b, npyv_nlanes_s64); } + +NPY_FINLINE npyv_f32 npyv_xor_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vxor_vv_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + __riscv_vreinterpret_v_f32m1_u32m1(b), + npyv_nlanes_f32 + ) + ); +} +NPY_FINLINE npyv_f64 npyv_xor_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vxor_vv_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + __riscv_vreinterpret_v_f64m1_u64m1(b), + npyv_nlanes_f64 + ) + ); +} + +#define npyv_xor_b8 npyv_xor_u8 +#define npyv_xor_b16 npyv_xor_u16 +#define npyv_xor_b32 npyv_xor_u32 +#define npyv_xor_b64 npyv_xor_u64 + +// NOT +NPY_FINLINE npyv_u8 npyv_not_u8(npyv_u8 a) +{ return __riscv_vnot_v_u8m1(a, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_not_s8(npyv_s8 a) +{ return __riscv_vnot_v_i8m1(a, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_not_u16(npyv_u16 a) +{ return __riscv_vnot_v_u16m1(a, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_not_s16(npyv_s16 a) +{ return __riscv_vnot_v_i16m1(a, npyv_nlanes_s16); } +NPY_FINLINE npyv_u32 npyv_not_u32(npyv_u32 a) +{ return __riscv_vnot_v_u32m1(a, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_not_s32(npyv_s32 a) +{ return __riscv_vnot_v_i32m1(a, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_not_u64(npyv_u64 a) +{ return __riscv_vnot_v_u64m1(a, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_not_s64(npyv_s64 a) +{ return __riscv_vnot_v_i64m1(a, npyv_nlanes_s64); } + +NPY_FINLINE npyv_f32 npyv_not_f32(npyv_f32 a) +{ + return __riscv_vreinterpret_v_u32m1_f32m1( + __riscv_vnot_v_u32m1( + __riscv_vreinterpret_v_f32m1_u32m1(a), + npyv_nlanes_f32 + ) + ); +} +NPY_FINLINE npyv_f64 npyv_not_f64(npyv_f64 a) +{ + return __riscv_vreinterpret_v_u64m1_f64m1( + __riscv_vnot_v_u64m1( + __riscv_vreinterpret_v_f64m1_u64m1(a), + npyv_nlanes_f64 + ) + ); +} + +#define npyv_not_b8 npyv_not_u8 +#define npyv_not_b16 npyv_not_u16 +#define npyv_not_b32 npyv_not_u32 +#define npyv_not_b64 npyv_not_u64 + +// ANDC, ORC and XNOR +NPY_FINLINE npyv_u8 npyv_andc_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vand_vv_u8m1(a, __riscv_vnot_v_u8m1(b, npyv_nlanes_u8), npyv_nlanes_u8); } + +#define npyv_andc_b8 npyv_andc_u8 +NPY_FINLINE npyv_b8 npyv_orc_b8(npyv_b8 a, npyv_b8 b) +{ return __riscv_vor_vv_u8m1(a, __riscv_vnot_v_u8m1(b, npyv_nlanes_u8), npyv_nlanes_u8); } +NPY_FINLINE npyv_b8 npyv_xnor_b8(npyv_b8 a, npyv_b8 b) +{ return __riscv_vnot_v_u8m1(__riscv_vxor_vv_u8m1(a, b, npyv_nlanes_u8), npyv_nlanes_u8); } + +/*************************** + * Comparison + ***************************/ +// equal +NPY_FINLINE npyv_b8 
npyv_cmpeq_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmseq_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpeq_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmseq_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpeq_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmseq_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpeq_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmseq_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpeq_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmseq_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpeq_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmseq_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpeq_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmseq_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpeq_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmseq_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpeq_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfeq_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpeq_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfeq_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } + +// not Equal +NPY_FINLINE npyv_b8 npyv_cmpneq_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmsne_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpneq_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmsne_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpneq_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmsne_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpneq_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmsne_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpneq_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmsne_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpneq_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmsne_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpneq_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmsne_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpneq_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmsne_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpneq_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfne_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpneq_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfne_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } + +// greater than +NPY_FINLINE npyv_b8 npyv_cmpgt_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmsgtu_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpgt_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmsgt_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpgt_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmsgtu_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpgt_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmsgt_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpgt_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmsgtu_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpgt_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmsgt_vv_i32m1_b32(a, b, 
npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpgt_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmsgtu_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpgt_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmsgt_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpgt_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfgt_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpgt_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfgt_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } + +// greater than or equal +NPY_FINLINE npyv_b8 npyv_cmpge_u8(npyv_u8 a, npyv_u8 b) +{ return npyv__to_b8(__riscv_vmsgeu_vv_u8m1_b8(a, b, npyv_nlanes_u8)); } +NPY_FINLINE npyv_b8 npyv_cmpge_s8(npyv_s8 a, npyv_s8 b) +{ return npyv__to_b8(__riscv_vmsge_vv_i8m1_b8(a, b, npyv_nlanes_s8)); } +NPY_FINLINE npyv_b16 npyv_cmpge_u16(npyv_u16 a, npyv_u16 b) +{ return npyv__to_b16(__riscv_vmsgeu_vv_u16m1_b16(a, b, npyv_nlanes_u16)); } +NPY_FINLINE npyv_b16 npyv_cmpge_s16(npyv_s16 a, npyv_s16 b) +{ return npyv__to_b16(__riscv_vmsge_vv_i16m1_b16(a, b, npyv_nlanes_s16)); } +NPY_FINLINE npyv_b32 npyv_cmpge_u32(npyv_u32 a, npyv_u32 b) +{ return npyv__to_b32(__riscv_vmsgeu_vv_u32m1_b32(a, b, npyv_nlanes_u32)); } +NPY_FINLINE npyv_b32 npyv_cmpge_s32(npyv_s32 a, npyv_s32 b) +{ return npyv__to_b32(__riscv_vmsge_vv_i32m1_b32(a, b, npyv_nlanes_s32)); } +NPY_FINLINE npyv_b64 npyv_cmpge_u64(npyv_u64 a, npyv_u64 b) +{ return npyv__to_b64(__riscv_vmsgeu_vv_u64m1_b64(a, b, npyv_nlanes_u64)); } +NPY_FINLINE npyv_b64 npyv_cmpge_s64(npyv_s64 a, npyv_s64 b) +{ return npyv__to_b64(__riscv_vmsge_vv_i64m1_b64(a, b, npyv_nlanes_s64)); } +NPY_FINLINE npyv_b32 npyv_cmpge_f32(npyv_f32 a, npyv_f32 b) +{ return npyv__to_b32(__riscv_vmfge_vv_f32m1_b32(a, b, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_cmpge_f64(npyv_f64 a, npyv_f64 b) +{ return npyv__to_b64(__riscv_vmfge_vv_f64m1_b64(a, b, npyv_nlanes_f64)); } + +// less than +#define npyv_cmplt_u8(A, B) npyv_cmpgt_u8(B, A) +#define npyv_cmplt_s8(A, B) npyv_cmpgt_s8(B, A) +#define npyv_cmplt_u16(A, B) npyv_cmpgt_u16(B, A) +#define npyv_cmplt_s16(A, B) npyv_cmpgt_s16(B, A) +#define npyv_cmplt_u32(A, B) npyv_cmpgt_u32(B, A) +#define npyv_cmplt_s32(A, B) npyv_cmpgt_s32(B, A) +#define npyv_cmplt_u64(A, B) npyv_cmpgt_u64(B, A) +#define npyv_cmplt_s64(A, B) npyv_cmpgt_s64(B, A) +#define npyv_cmplt_f32(A, B) npyv_cmpgt_f32(B, A) +#define npyv_cmplt_f64(A, B) npyv_cmpgt_f64(B, A) + +// less than or equal +#define npyv_cmple_u8(A, B) npyv_cmpge_u8(B, A) +#define npyv_cmple_s8(A, B) npyv_cmpge_s8(B, A) +#define npyv_cmple_u16(A, B) npyv_cmpge_u16(B, A) +#define npyv_cmple_s16(A, B) npyv_cmpge_s16(B, A) +#define npyv_cmple_u32(A, B) npyv_cmpge_u32(B, A) +#define npyv_cmple_s32(A, B) npyv_cmpge_s32(B, A) +#define npyv_cmple_u64(A, B) npyv_cmpge_u64(B, A) +#define npyv_cmple_s64(A, B) npyv_cmpge_s64(B, A) +#define npyv_cmple_f32(A, B) npyv_cmpge_f32(B, A) +#define npyv_cmple_f64(A, B) npyv_cmpge_f64(B, A) + +// check special cases +NPY_FINLINE npyv_b32 npyv_notnan_f32(npyv_f32 a) +{ return npyv__to_b32(__riscv_vmfeq_vv_f32m1_b32(a, a, npyv_nlanes_f32)); } +NPY_FINLINE npyv_b64 npyv_notnan_f64(npyv_f64 a) +{ return npyv__to_b64(__riscv_vmfeq_vv_f64m1_b64(a, a, npyv_nlanes_f64)); } + +// Test cross all vector lanes +// any: returns true if any of the elements is not equal to zero +// all: returns true if all elements are not equal to zero +NPY_FINLINE bool npyv_any_u8(npyv_u8 a) +{ return __riscv_vfirst(__riscv_vmsne(a, 0, 
npyv_nlanes_u8), npyv_nlanes_u8) != -1; } +NPY_FINLINE bool npyv_all_u8(npyv_u8 a) +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u8), npyv_nlanes_u8) == -1; } +NPY_FINLINE bool npyv_any_u16(npyv_u16 a) +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u16), npyv_nlanes_u16) != -1; } +NPY_FINLINE bool npyv_all_u16(npyv_u16 a) +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u16), npyv_nlanes_u16) == -1; } +NPY_FINLINE bool npyv_any_u32(npyv_u32 a) +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u32), npyv_nlanes_u32) != -1; } +NPY_FINLINE bool npyv_all_u32(npyv_u32 a) +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u32), npyv_nlanes_u32) == -1; } +NPY_FINLINE bool npyv_any_u64(npyv_u64 a) +{ return __riscv_vfirst(__riscv_vmsne(a, 0, npyv_nlanes_u64), npyv_nlanes_u64) != -1; } +NPY_FINLINE bool npyv_all_u64(npyv_u64 a) +{ return __riscv_vfirst(__riscv_vmseq(a, 0, npyv_nlanes_u64), npyv_nlanes_u64) == -1; } + +#define npyv_any_b8 npyv_any_u8 +#define npyv_all_b8 npyv_all_u8 +#define npyv_any_b16 npyv_any_u16 +#define npyv_all_b16 npyv_all_u16 +#define npyv_any_b32 npyv_any_u32 +#define npyv_all_b32 npyv_all_u32 +#define npyv_any_b64 npyv_any_u64 +#define npyv_all_b64 npyv_all_u64 + +NPY_FINLINE bool npyv_any_s8(npyv_s8 a) +{ return npyv_any_u8(npyv_reinterpret_u8_s8(a)); } +NPY_FINLINE bool npyv_all_s8(npyv_s8 a) +{ return npyv_all_u8(npyv_reinterpret_u8_s8(a)); } +NPY_FINLINE bool npyv_any_s16(npyv_s16 a) +{ return npyv_any_u16(npyv_reinterpret_u16_s16(a)); } +NPY_FINLINE bool npyv_all_s16(npyv_s16 a) +{ return npyv_all_u16(npyv_reinterpret_u16_s16(a)); } +NPY_FINLINE bool npyv_any_s32(npyv_s32 a) +{ return npyv_any_u32(npyv_reinterpret_u32_s32(a)); } +NPY_FINLINE bool npyv_all_s32(npyv_s32 a) +{ return npyv_all_u32(npyv_reinterpret_u32_s32(a)); } +NPY_FINLINE bool npyv_any_s64(npyv_s64 a) +{ return npyv_any_u64(npyv_reinterpret_u64_s64(a)); } +NPY_FINLINE bool npyv_all_s64(npyv_s64 a) +{ return npyv_all_u64(npyv_reinterpret_u64_s64(a)); } + +NPY_FINLINE bool npyv_any_f32(npyv_f32 a) +{ return npyv_any_u32(npyv_reinterpret_u32_f32(__riscv_vfabs(a, npyv_nlanes_f32))); } +NPY_FINLINE bool npyv_all_f32(npyv_f32 a) +{ return npyv_all_u32(npyv_reinterpret_u32_f32(__riscv_vfabs(a, npyv_nlanes_f32))); } +NPY_FINLINE bool npyv_any_f64(npyv_f64 a) +{ return npyv_any_u64(npyv_reinterpret_u64_f64(__riscv_vfabs(a, npyv_nlanes_f64))); } +NPY_FINLINE bool npyv_all_f64(npyv_f64 a) +{ return npyv_all_u64(npyv_reinterpret_u64_f64(__riscv_vfabs(a, npyv_nlanes_f64))); } + +#endif // _NPY_SIMD_RVV_OPERATORS_H diff --git a/numpy/_core/src/common/simd/rvv/reorder.h b/numpy/_core/src/common/simd/rvv/reorder.h new file mode 100644 index 000000000000..28a146c3deb0 --- /dev/null +++ b/numpy/_core/src/common/simd/rvv/reorder.h @@ -0,0 +1,268 @@ +#ifndef NPY_SIMD + #error "Not a standalone header" +#endif + +#ifndef _NPY_SIMD_RVV_REORDER_H +#define _NPY_SIMD_RVV_REORDER_H + +// combine lower part of two vectors +NPY_FINLINE npyv_u8 npyv_combinel_u8(npyv_u8 a, npyv_u8 b) +{ return __riscv_vslideup_vx_u8m1(a, b, npyv_nlanes_u8 / 2, npyv_nlanes_u8); } +NPY_FINLINE npyv_s8 npyv_combinel_s8(npyv_s8 a, npyv_s8 b) +{ return __riscv_vslideup_vx_i8m1(a, b, npyv_nlanes_s8 / 2, npyv_nlanes_s8); } +NPY_FINLINE npyv_u16 npyv_combinel_u16(npyv_u16 a, npyv_u16 b) +{ return __riscv_vslideup_vx_u16m1(a, b, npyv_nlanes_u16 / 2, npyv_nlanes_u16); } +NPY_FINLINE npyv_s16 npyv_combinel_s16(npyv_s16 a, npyv_s16 b) +{ return __riscv_vslideup_vx_i16m1(a, b, npyv_nlanes_s16 / 2, npyv_nlanes_s16); } 
+NPY_FINLINE npyv_u32 npyv_combinel_u32(npyv_u32 a, npyv_u32 b) +{ return __riscv_vslideup_vx_u32m1(a, b, npyv_nlanes_u32 / 2, npyv_nlanes_u32); } +NPY_FINLINE npyv_s32 npyv_combinel_s32(npyv_s32 a, npyv_s32 b) +{ return __riscv_vslideup_vx_i32m1(a, b, npyv_nlanes_s32 / 2, npyv_nlanes_s32); } +NPY_FINLINE npyv_u64 npyv_combinel_u64(npyv_u64 a, npyv_u64 b) +{ return __riscv_vslideup_vx_u64m1(a, b, npyv_nlanes_u64 / 2, npyv_nlanes_u64); } +NPY_FINLINE npyv_s64 npyv_combinel_s64(npyv_s64 a, npyv_s64 b) +{ return __riscv_vslideup_vx_i64m1(a, b, npyv_nlanes_s64 / 2, npyv_nlanes_s64); } +NPY_FINLINE npyv_f32 npyv_combinel_f32(npyv_f32 a, npyv_f32 b) +{ return __riscv_vslideup_vx_f32m1(a, b, npyv_nlanes_f32 / 2, npyv_nlanes_f32); } +NPY_FINLINE npyv_f64 npyv_combinel_f64(npyv_f64 a, npyv_f64 b) +{ return __riscv_vslideup_vx_f64m1(a, b, npyv_nlanes_f64 / 2, npyv_nlanes_f64); } + +// combine higher part of two vectors +NPY_FINLINE npyv_u8 npyv_combineh_u8(npyv_u8 a, npyv_u8 b) +{ + return __riscv_vslideup_vx_u8m1( + __riscv_vslidedown_vx_u8m1(a, npyv_nlanes_u8 / 2, npyv_nlanes_u8), + __riscv_vslidedown_vx_u8m1(b, npyv_nlanes_u8 / 2, npyv_nlanes_u8), + npyv_nlanes_u8 / 2, + npyv_nlanes_u8 + ); +} + +NPY_FINLINE npyv_u16 npyv_combineh_u16(npyv_u16 a, npyv_u16 b) +{ + return __riscv_vslideup_vx_u16m1( + __riscv_vslidedown_vx_u16m1(a, npyv_nlanes_u16 / 2, npyv_nlanes_u16), + __riscv_vslidedown_vx_u16m1(b, npyv_nlanes_u16 / 2, npyv_nlanes_u16), + npyv_nlanes_u16 / 2, + npyv_nlanes_u16 + ); +} + +NPY_FINLINE npyv_u32 npyv_combineh_u32(npyv_u32 a, npyv_u32 b) +{ + return __riscv_vslideup_vx_u32m1( + __riscv_vslidedown_vx_u32m1(a, npyv_nlanes_u32 / 2, npyv_nlanes_u32), + __riscv_vslidedown_vx_u32m1(b, npyv_nlanes_u32 / 2, npyv_nlanes_u32), + npyv_nlanes_u32 / 2, + npyv_nlanes_u32 + ); +} + +NPY_FINLINE npyv_u64 npyv_combineh_u64(npyv_u64 a, npyv_u64 b) +{ + return __riscv_vslideup_vx_u64m1( + __riscv_vslidedown_vx_u64m1(a, npyv_nlanes_u64 / 2, npyv_nlanes_u64), + __riscv_vslidedown_vx_u64m1(b, npyv_nlanes_u64 / 2, npyv_nlanes_u64), + npyv_nlanes_u64 / 2, + npyv_nlanes_u64 + ); +} + +NPY_FINLINE npyv_s8 npyv_combineh_s8(npyv_s8 a, npyv_s8 b) +{ + return __riscv_vslideup_vx_i8m1( + __riscv_vslidedown_vx_i8m1(a, npyv_nlanes_s8 / 2, npyv_nlanes_s8), + __riscv_vslidedown_vx_i8m1(b, npyv_nlanes_s8 / 2, npyv_nlanes_s8), + npyv_nlanes_s8 / 2, + npyv_nlanes_s8 + ); +} + +NPY_FINLINE npyv_s16 npyv_combineh_s16(npyv_s16 a, npyv_s16 b) +{ + return __riscv_vslideup_vx_i16m1( + __riscv_vslidedown_vx_i16m1(a, npyv_nlanes_s16 / 2, npyv_nlanes_s16), + __riscv_vslidedown_vx_i16m1(b, npyv_nlanes_s16 / 2, npyv_nlanes_s16), + npyv_nlanes_s16 / 2, + npyv_nlanes_s16 + ); +} + +NPY_FINLINE npyv_s32 npyv_combineh_s32(npyv_s32 a, npyv_s32 b) +{ + return __riscv_vslideup_vx_i32m1( + __riscv_vslidedown_vx_i32m1(a, npyv_nlanes_s32 / 2, npyv_nlanes_s32), + __riscv_vslidedown_vx_i32m1(b, npyv_nlanes_s32 / 2, npyv_nlanes_s32), + npyv_nlanes_s32 / 2, + npyv_nlanes_s32 + ); +} + +NPY_FINLINE npyv_s64 npyv_combineh_s64(npyv_s64 a, npyv_s64 b) +{ + return __riscv_vslideup_vx_i64m1( + __riscv_vslidedown_vx_i64m1(a, npyv_nlanes_s64 / 2, npyv_nlanes_s64), + __riscv_vslidedown_vx_i64m1(b, npyv_nlanes_s64 / 2, npyv_nlanes_s64), + npyv_nlanes_s64 / 2, + npyv_nlanes_s64 + ); +} + +NPY_FINLINE npyv_f32 npyv_combineh_f32(npyv_f32 a, npyv_f32 b) +{ + return __riscv_vslideup_vx_f32m1( + __riscv_vslidedown_vx_f32m1(a, npyv_nlanes_f32 / 2, npyv_nlanes_f32), + __riscv_vslidedown_vx_f32m1(b, npyv_nlanes_f32 / 2, npyv_nlanes_f32), + npyv_nlanes_f32 / 2, + 
npyv_nlanes_f32 + ); +} + +NPY_FINLINE npyv_f64 npyv_combineh_f64(npyv_f64 a, npyv_f64 b) +{ + return __riscv_vslideup_vx_f64m1( + __riscv_vslidedown_vx_f64m1(a, npyv_nlanes_f64 / 2, npyv_nlanes_f64), + __riscv_vslidedown_vx_f64m1(b, npyv_nlanes_f64 / 2, npyv_nlanes_f64), + npyv_nlanes_f64 / 2, + npyv_nlanes_f64 + ); +} + +// combine two vectors from lower and higher parts of two other vectors +#define NPYV_IMPL_RVV_COMBINE(T_VEC, SFX) \ + NPY_FINLINE T_VEC##x2 npyv_combine_##SFX(T_VEC a, T_VEC b) \ + { \ + return (T_VEC##x2){{ \ + npyv_combinel_##SFX(a, b), \ + npyv_combineh_##SFX(a, b) \ + }}; \ + } + +NPYV_IMPL_RVV_COMBINE(npyv_u8, u8) +NPYV_IMPL_RVV_COMBINE(npyv_s8, s8) +NPYV_IMPL_RVV_COMBINE(npyv_u16, u16) +NPYV_IMPL_RVV_COMBINE(npyv_s16, s16) +NPYV_IMPL_RVV_COMBINE(npyv_u32, u32) +NPYV_IMPL_RVV_COMBINE(npyv_s32, s32) +NPYV_IMPL_RVV_COMBINE(npyv_u64, u64) +NPYV_IMPL_RVV_COMBINE(npyv_s64, s64) +NPYV_IMPL_RVV_COMBINE(npyv_f32, f32) +NPYV_IMPL_RVV_COMBINE(npyv_f64, f64) +#undef NPYV_IMPL_RVV_COMBINE + +// interleave & deinterleave two vectors +#define NPYV_IMPL_RVV_ZIP(T_VEC, SFX, R_SFX, EEW) \ + NPY_FINLINE T_VEC##x2 npyv_zip_##SFX(T_VEC a, T_VEC b) \ + { \ + const int vl = npyv_nlanes_##SFX; \ + npyv_lanetype_##SFX v[vl * 2]; \ + __riscv_vsseg2##EEW( \ + v, __riscv_vcreate_v_##R_SFX##m1x2(a, b), vl \ + ); \ + return (T_VEC##x2){{ \ + __riscv_vl##EEW##_v_##R_SFX##m1(v , vl), \ + __riscv_vl##EEW##_v_##R_SFX##m1(v + vl, vl) \ + }}; \ + } \ + NPY_FINLINE T_VEC##x2 npyv_unzip_##SFX(T_VEC a, T_VEC b) \ + { \ + const int vl = npyv_nlanes_##SFX; \ + npyv_lanetype_##SFX v[vl * 2]; \ + __riscv_vs##EEW(v , a, vl); \ + __riscv_vs##EEW(v + vl, b, vl); \ + npyv__##SFX##x2 d = \ + __riscv_vlseg2##EEW##_v_##R_SFX##m1x2(v, vl); \ + return (T_VEC##x2){{ \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(d, 0), \ + __riscv_vget_v_##R_SFX##m1x2_##R_SFX##m1(d, 1) \ + }}; \ + } + +NPYV_IMPL_RVV_ZIP(npyv_u8, u8, u8, e8) +NPYV_IMPL_RVV_ZIP(npyv_s8, s8, i8, e8) +NPYV_IMPL_RVV_ZIP(npyv_u16, u16, u16, e16) +NPYV_IMPL_RVV_ZIP(npyv_s16, s16, i16, e16) +NPYV_IMPL_RVV_ZIP(npyv_u32, u32, u32, e32) +NPYV_IMPL_RVV_ZIP(npyv_s32, s32, i32, e32) +NPYV_IMPL_RVV_ZIP(npyv_u64, u64, u64, e64) +NPYV_IMPL_RVV_ZIP(npyv_s64, s64, i64, e64) +NPYV_IMPL_RVV_ZIP(npyv_f32, f32, f32, e32) +NPYV_IMPL_RVV_ZIP(npyv_f64, f64, f64, e64) +#undef NPYV_IMPL_RVV_ZIP + +// Reverse elements of each 64-bit lane +NPY_FINLINE npyv_u8 npyv_rev64_u8(npyv_u8 a) +{ + vuint8m1_t vid = __riscv_vid_v_u8m1(npyv_nlanes_u8); + vuint8m1_t sub = __riscv_vadd(__riscv_vsll(__riscv_vsrl(vid, 3, npyv_nlanes_u8), 4, npyv_nlanes_u8), 7, npyv_nlanes_u8); + vuint8m1_t idxs = __riscv_vsub(sub, vid, npyv_nlanes_u8); + return __riscv_vrgather(a, idxs, npyv_nlanes_u8); +} +NPY_FINLINE npyv_s8 npyv_rev64_s8(npyv_s8 a) +{ return __riscv_vreinterpret_v_u8m1_i8m1(npyv_rev64_u8(__riscv_vreinterpret_v_i8m1_u8m1(a))); } + +NPY_FINLINE npyv_u16 npyv_rev64_u16(npyv_u16 a) +{ + vuint16m1_t vid = __riscv_vid_v_u16m1(npyv_nlanes_u16); + vuint16m1_t sub = __riscv_vadd(__riscv_vsll(__riscv_vsrl(vid, 2, npyv_nlanes_u16), 3, npyv_nlanes_u16), 3, npyv_nlanes_u16); + vuint16m1_t idxs = __riscv_vsub(sub, vid, npyv_nlanes_u16); + return __riscv_vrgather(a, idxs, npyv_nlanes_u16); +} +NPY_FINLINE npyv_s16 npyv_rev64_s16(npyv_s16 a) +{ return __riscv_vreinterpret_v_u16m1_i16m1(npyv_rev64_u16(__riscv_vreinterpret_v_i16m1_u16m1(a))); } + +NPY_FINLINE npyv_u32 npyv_rev64_u32(npyv_u32 a) +{ + vuint16mf2_t vid = __riscv_vid_v_u16mf2(npyv_nlanes_u16 / 2); + vuint16mf2_t sub = 
__riscv_vadd(__riscv_vsll(__riscv_vsrl(vid, 1, npyv_nlanes_u16 / 2), 2, npyv_nlanes_u16 / 2), 1, npyv_nlanes_u16 / 2);
+    vuint16mf2_t idxs = __riscv_vsub(sub, vid, npyv_nlanes_u16 / 2);
+    return __riscv_vrgatherei16(a, idxs, npyv_nlanes_u32);
+}
+NPY_FINLINE npyv_s32 npyv_rev64_s32(npyv_s32 a)
+{ return __riscv_vreinterpret_v_u32m1_i32m1(npyv_rev64_u32(__riscv_vreinterpret_v_i32m1_u32m1(a))); }
+NPY_FINLINE npyv_f32 npyv_rev64_f32(npyv_f32 a)
+{ return __riscv_vreinterpret_v_u32m1_f32m1(npyv_rev64_u32(__riscv_vreinterpret_v_f32m1_u32m1(a))); }
+
+// Permuting the elements of each 128-bit lane by immediate index for
+// each element.
+#define npyv_permi128_u32(A, E0, E1, E2, E3) \
+    ({ \
+        const uint16_t v[] = { \
+            E0     , E1     , E2     , E3     , \
+            E0 +  4, E1 +  4, E2 +  4, E3 +  4, \
+            E0 +  8, E1 +  8, E2 +  8, E3 +  8, \
+            E0 + 12, E1 + 12, E2 + 12, E3 + 12, \
+            E0 + 16, E1 + 16, E2 + 16, E3 + 16, \
+            E0 + 20, E1 + 20, E2 + 20, E3 + 20, \
+            E0 + 24, E1 + 24, E2 + 24, E3 + 24, \
+            E0 + 28, E1 + 28, E2 + 28, E3 + 28   \
+        }; \
+        __riscv_vrgatherei16( \
+            A, __riscv_vle16_v_u16mf2(v, npyv_nlanes_u32), \
+            npyv_nlanes_u32 \
+        ); \
+    })
+#define npyv_permi128_s32(A, E0, E1, E2, E3) __riscv_vreinterpret_v_u32m1_i32m1(npyv_permi128_u32(__riscv_vreinterpret_v_i32m1_u32m1(A), E0, E1, E2, E3))
+#define npyv_permi128_f32(A, E0, E1, E2, E3) __riscv_vreinterpret_v_u32m1_f32m1(npyv_permi128_u32(__riscv_vreinterpret_v_f32m1_u32m1(A), E0, E1, E2, E3))
+
+#define npyv_permi128_u64(A, E0, E1) \
+    ({ \
+        const uint16_t v[] = { \
+            E0     , E1     , \
+            E0 +  2, E1 +  2, \
+            E0 +  4, E1 +  4, \
+            E0 +  6, E1 +  6, \
+            E0 +  8, E1 +  8, \
+            E0 + 10, E1 + 10, \
+            E0 + 12, E1 + 12, \
+            E0 + 14, E1 + 14  \
+        }; \
+        __riscv_vrgatherei16( \
+            A, __riscv_vle16_v_u16mf4(v, npyv_nlanes_u64), \
+            npyv_nlanes_u64 \
+        ); \
+    })
+#define npyv_permi128_s64(A, E0, E1) __riscv_vreinterpret_v_u64m1_i64m1(npyv_permi128_u64(__riscv_vreinterpret_v_i64m1_u64m1(A), E0, E1))
+#define npyv_permi128_f64(A, E0, E1) __riscv_vreinterpret_v_u64m1_f64m1(npyv_permi128_u64(__riscv_vreinterpret_v_f64m1_u64m1(A), E0, E1))
+
+#endif // _NPY_SIMD_RVV_REORDER_H
diff --git a/numpy/_core/src/common/simd/rvv/rvv.h b/numpy/_core/src/common/simd/rvv/rvv.h
new file mode 100644
index 000000000000..f798976da8a1
--- /dev/null
+++ b/numpy/_core/src/common/simd/rvv/rvv.h
@@ -0,0 +1,106 @@
+#ifndef _NPY_SIMD_H_
+    #error "Not a standalone header"
+#endif
+
+#include <riscv_vector.h>
+
+// Supports VLEN of 128, 256 and 512 bits; npyv_tobits_b8
+// cannot be implemented when VLEN > 512.
+#define NPY_SIMD __riscv_v_fixed_vlen
+#define NPY_SIMD_WIDTH (__riscv_v_fixed_vlen / 8)
+#define NPY_SIMD_F32 1
+#define NPY_SIMD_F64 1
+
+#ifdef NPY_HAVE_FMA3
+    #define NPY_SIMD_FMA3 1  // native support
+#else
+    #define NPY_SIMD_FMA3 0  // fast emulated
+#endif
+
+#define NPY_SIMD_BIGENDIAN 0
+#define NPY_SIMD_CMPSIGNAL 1
+
+typedef vuint8m1_t   fixed_vuint8m1_t   __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vuint16m1_t  fixed_vuint16m1_t  __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vuint32m1_t  fixed_vuint32m1_t  __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vuint64m1_t  fixed_vuint64m1_t  __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vint8m1_t    fixed_vint8m1_t    __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vint16m1_t   fixed_vint16m1_t   __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vint32m1_t   fixed_vint32m1_t   __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));
+typedef vint64m1_t
fixed_vint64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vfloat32m1_t fixed_vfloat32m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); +typedef vfloat64m1_t fixed_vfloat64m1_t __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen))); + +#define npyv_u8 fixed_vuint8m1_t +#define npyv_u16 fixed_vuint16m1_t +#define npyv_u32 fixed_vuint32m1_t +#define npyv_u64 fixed_vuint64m1_t +#define npyv_s8 fixed_vint8m1_t +#define npyv_s16 fixed_vint16m1_t +#define npyv_s32 fixed_vint32m1_t +#define npyv_s64 fixed_vint64m1_t +#define npyv_f32 fixed_vfloat32m1_t +#define npyv_f64 fixed_vfloat64m1_t + +// simulate bool as uint due to gcc/clang bugs, change to fixed_vbool if possible +#define npyv_b8 fixed_vuint8m1_t +#define npyv_b16 fixed_vuint16m1_t +#define npyv_b32 fixed_vuint32m1_t +#define npyv_b64 fixed_vuint64m1_t + + +typedef struct { fixed_vuint8m1_t val[2]; } npyv_u8x2; +typedef struct { fixed_vint8m1_t val[2]; } npyv_s8x2; +typedef struct { fixed_vuint16m1_t val[2]; } npyv_u16x2; +typedef struct { fixed_vint16m1_t val[2]; } npyv_s16x2; +typedef struct { fixed_vuint32m1_t val[2]; } npyv_u32x2; +typedef struct { fixed_vint32m1_t val[2]; } npyv_s32x2; +typedef struct { fixed_vuint64m1_t val[2]; } npyv_u64x2; +typedef struct { fixed_vint64m1_t val[2]; } npyv_s64x2; +typedef struct { fixed_vfloat32m1_t val[2]; } npyv_f32x2; +typedef struct { fixed_vfloat64m1_t val[2]; } npyv_f64x2; + + +typedef struct { fixed_vuint8m1_t val[3]; } npyv_u8x3; +typedef struct { fixed_vint8m1_t val[3]; } npyv_s8x3; +typedef struct { fixed_vuint16m1_t val[3]; } npyv_u16x3; +typedef struct { fixed_vint16m1_t val[3]; } npyv_s16x3; +typedef struct { fixed_vuint32m1_t val[3]; } npyv_u32x3; +typedef struct { fixed_vint32m1_t val[3]; } npyv_s32x3; +typedef struct { fixed_vuint64m1_t val[3]; } npyv_u64x3; +typedef struct { fixed_vint64m1_t val[3]; } npyv_s64x3; +typedef struct { fixed_vfloat32m1_t val[3]; } npyv_f32x3; +typedef struct { fixed_vfloat64m1_t val[3]; } npyv_f64x3; + + +// helper types +#define npyv__u8x2 vuint8m1x2_t +#define npyv__u16x2 vuint16m1x2_t +#define npyv__u32x2 vuint32m1x2_t +#define npyv__u64x2 vuint64m1x2_t +#define npyv__s8x2 vint8m1x2_t +#define npyv__s16x2 vint16m1x2_t +#define npyv__s32x2 vint32m1x2_t +#define npyv__s64x2 vint64m1x2_t +#define npyv__f32x2 vfloat32m1x2_t +#define npyv__f64x2 vfloat64m1x2_t + + +#define npyv_nlanes_u8 32 +#define npyv_nlanes_s8 32 +#define npyv_nlanes_u16 16 +#define npyv_nlanes_s16 16 +#define npyv_nlanes_u32 8 +#define npyv_nlanes_s32 8 +#define npyv_nlanes_u64 4 +#define npyv_nlanes_s64 4 +#define npyv_nlanes_f32 8 +#define npyv_nlanes_f64 4 + +#include "memory.h" +#include "misc.h" +#include "reorder.h" +#include "operators.h" +#include "conversion.h" +#include "arithmetic.h" +#include "math.h" diff --git a/numpy/_core/src/common/simd/simd.h b/numpy/_core/src/common/simd/simd.h index fe4ca4da92f5..0bb313bb7300 100644 --- a/numpy/_core/src/common/simd/simd.h +++ b/numpy/_core/src/common/simd/simd.h @@ -91,6 +91,10 @@ typedef double npyv_lanetype_f64; #include "lsx/lsx.h" #endif +#ifdef NPY_HAVE_RVV + #include "rvv/rvv.h" +#endif + #ifndef NPY_SIMD /// SIMD width in bits or 0 if there's no SIMD extension available. 
#define NPY_SIMD 0 diff --git a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src index d330c21695d5..f8b44ee41909 100644 --- a/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src +++ b/numpy/_core/src/umath/loops_arithmetic.dispatch.c.src @@ -36,7 +36,7 @@ * q = TRUNC((n - (-dsign ) + (-nsign))/d) - (-qsign); ********************************************************************************/ -#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) +#if (defined(NPY_HAVE_VSX) && !defined(NPY_HAVE_VSX4)) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) || defined(NPY_HAVE_RVV) // Due to integer 128-bit multiplication emulation, SIMD 64-bit division // may not perform well on both neon and up to VSX3 compared to scalar // division. @@ -452,7 +452,7 @@ NPY_NO_EXPORT int NPY_CPU_DISPATCH_CURFX(@TYPE@_divide_indexed) * Therefore it's better to disable NPYV in this special case to avoid any unnecessary shuffles. * Power10(VSX4) is an exception here since it has native support for integer vector division. */ -#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX)) +#if NPY_BITSOF_@STYPE@ == 64 && !defined(NPY_HAVE_VSX4) && (defined(NPY_HAVE_VSX) || defined(NPY_HAVE_NEON) || defined(NPY_HAVE_LSX) || defined(NPY_HAVE_RVV)) #undef TO_SIMD_SFX #endif NPY_NO_EXPORT void NPY_CPU_DISPATCH_CURFX(@TYPE@_divide) diff --git a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src index e12a98864d96..885ee13c7b86 100755 --- a/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src +++ b/numpy/_core/src/umath/loops_hyperbolic.dispatch.cpp.src @@ -152,11 +152,19 @@ store_vector(vtype vec, type_t* dst, npy_intp sdst, npy_intp len){ #if NPY_SIMD_F64 [[maybe_unused]] HWY_ATTR NPY_FINLINE vec_f64 lut_16_f64(const double * lut, vec_u64 idx){ +#if defined(NPY_HAVE_RVV) + if (hn::MaxLanes(f64) == 8){ +#else if constexpr(hn::MaxLanes(f64) == 8){ +#endif const vec_f64 lut0 = hn::Load(f64, lut); const vec_f64 lut1 = hn::Load(f64, lut + 8); return hn::TwoTablesLookupLanes(f64, lut0, lut1, hn::IndicesFromVec(f64, idx)); +#if defined(NPY_HAVE_RVV) + }else if (hn::MaxLanes(f64) == 4){ +#else }else if constexpr (hn::MaxLanes(f64) == 4){ +#endif const vec_f64 lut0 = hn::Load(f64, lut); const vec_f64 lut1 = hn::Load(f64, lut + 4); const vec_f64 lut2 = hn::Load(f64, lut + 8); @@ -371,6 +379,9 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ const int nlanes = hn::Lanes(f64); const vec_f64 qnan = hn::Set(f64, NPY_NAN); +#if defined(NPY_HAVE_RVV) + vec_f64 vec0,vec1; +#endif for (; len > 0; len -= nlanes, src += ssrc*nlanes, dst += sdst*nlanes) { vec_f64 x = load_vector(src, ssrc, len); @@ -391,6 +402,22 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ // transpose the coef. into lanes. 2 lane transpose is all that's // implemented so we require `npyv_nlanes_f64` == 2. 
vec_f64 b, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16; +#if defined(NPY_HAVE_RVV) + if (hn::MaxLanes(f64) == 2){ + uint64_t index[2]; + hn::StoreU(idx, u64, index); + + /**begin repeat + * #off= 0, 2, 4, 6, 8, 10, 12, 14, 16# + * #e0 = b, c1, c3, c5, c7, c9, c11, c13, c15# + * #e1 = c0,c2, c4, c6, c8, c10,c12, c14, c16# + */ + vec0 = hn::LoadU(f64, (const double*)lut18x16 + index[0] * 18 + @off@); + vec1 = hn::LoadU(f64, (const double*)lut18x16 + index[1] * 18 + @off@); + @e0@ = hn::ConcatLowerLower(f64, vec1, vec0); + @e1@ = hn::ConcatUpperUpper(f64, vec1, vec0); + /**end repeat**/ +#else if constexpr(hn::MaxLanes(f64) == 2){ vec_f64 e0e1_0, e0e1_1; uint64_t index[hn::Lanes(f64)]; @@ -406,6 +433,7 @@ simd_tanh_f64(const double *src, npy_intp ssrc, double *dst, npy_intp sdst, npy_ @e0@ = hn::ConcatLowerLower(f64, e0e1_1, e0e1_0); @e1@ = hn::ConcatUpperUpper(f64, e0e1_1, e0e1_0); /**end repeat**/ +#endif } else { b = lut_16_f64((const double*)lut16x18 + 16*0, idx); c0 = lut_16_f64((const double*)lut16x18 + 1*16, idx); diff --git a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src index 9f7ed6c1dfc4..c824d4aa3ae0 100644 --- a/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src +++ b/numpy/_core/src/umath/loops_unary_fp_le.dispatch.c.src @@ -313,7 +313,7 @@ npyv_pack_isfinite_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, NPY_FINLINE npyv_u32 npyv_signbit_f32(npyv_f32 v) { - return npyv_shri_u32(npyv_reinterpret_u32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1); + return npyv_reinterpret_u32_s32(npyv_shri_s32(npyv_reinterpret_s32_f32(v), (sizeof(npyv_lanetype_f32)*8)-1)); } NPY_FINLINE npyv_u8 npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) @@ -345,7 +345,7 @@ npyv_pack_signbit_f32(npyv_f32 v0, npyv_f32 v1, npyv_f32 v2, npyv_f32 v3) NPY_FINLINE npyv_u64 npyv_signbit_f64(npyv_f64 v) { - return npyv_shri_u64(npyv_reinterpret_u64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1); + return npyv_reinterpret_u64_s64(npyv_shri_s64(npyv_reinterpret_s64_f64(v), (sizeof(npyv_lanetype_f64)*8)-1)); } NPY_FINLINE npyv_u8 npyv_pack_signbit_f64(npyv_f64 v0, npyv_f64 v1, npyv_f64 v2, npyv_f64 v3, @@ -486,10 +486,26 @@ static void simd_unary_@kind@_@TYPE@_@STYPE@_@DTYPE@ op[0 * ostride] = lane[0 * sizeof(npyv_lanetype_@sfx@)]; op[1 * ostride] = lane[1 * sizeof(npyv_lanetype_@sfx@)]; - #if npyv_nlanes_@sfx@ == 4 + #if npyv_nlanes_@sfx@ >= 4 op[2 * ostride] = lane[2 * sizeof(npyv_lanetype_@sfx@)]; op[3 * ostride] = lane[3 * sizeof(npyv_lanetype_@sfx@)]; #endif + #if npyv_nlanes_@sfx@ >= 8 + op[4 * ostride] = lane[4 * sizeof(npyv_lanetype_@sfx@)]; + op[5 * ostride] = lane[5 * sizeof(npyv_lanetype_@sfx@)]; + op[6 * ostride] = lane[6 * sizeof(npyv_lanetype_@sfx@)]; + op[7 * ostride] = lane[7 * sizeof(npyv_lanetype_@sfx@)]; + #endif + #if npyv_nlanes_@sfx@ >= 16 + op[ 8 * ostride] = lane[ 8 * sizeof(npyv_lanetype_@sfx@)]; + op[ 9 * ostride] = lane[ 9 * sizeof(npyv_lanetype_@sfx@)]; + op[10 * ostride] = lane[10 * sizeof(npyv_lanetype_@sfx@)]; + op[11 * ostride] = lane[11 * sizeof(npyv_lanetype_@sfx@)]; + op[12 * ostride] = lane[12 * sizeof(npyv_lanetype_@sfx@)]; + op[13 * ostride] = lane[13 * sizeof(npyv_lanetype_@sfx@)]; + op[14 * ostride] = lane[14 * sizeof(npyv_lanetype_@sfx@)]; + op[15 * ostride] = lane[15 * sizeof(npyv_lanetype_@sfx@)]; + #endif } #undef PACK_FACTOR diff --git a/numpy/_core/tests/test_cpu_dispatcher.py b/numpy/_core/tests/test_cpu_dispatcher.py index c52cd418a08b..38ff010c5144 
100644 --- a/numpy/_core/tests/test_cpu_dispatcher.py +++ b/numpy/_core/tests/test_cpu_dispatcher.py @@ -12,7 +12,7 @@ def test_dispatcher(): "SSE2", "SSE41", "AVX2", "VSX", "VSX2", "VSX3", "NEON", "ASIMD", "ASIMDHP", - "VX", "VXE", "LSX" + "VX", "VXE", "LSX", "RVV" ) highest_sfx = "" # no suffix for the baseline all_sfx = []
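Note (not part of the patch): the standalone sketch below illustrates the fixed-VLEN convention that the new rvv/rvv.h header relies on and the vfirst-based "any lane" idiom used by rvv/operators.h. It assumes a compiler invoked with the flags added to numpy/_core/meson.build in this patch (-march=rv64gcv_zvl256b -mrvv-vector-bits=256), so __riscv_v_fixed_vlen is 256; the type and variable names (fixed_u32, a_in, b_in) are illustrative only and are not part of the npyv_* API.

#include <riscv_vector.h>
#include <stdint.h>
#include <stdio.h>

// Fixed-size view of an LMUL=1 vector, mirroring the fixed_vuint32m1_t typedef in rvv.h.
typedef vuint32m1_t fixed_u32 __attribute__((riscv_rvv_vector_bits(__riscv_v_fixed_vlen)));

int main(void)
{
    enum { VL = __riscv_v_fixed_vlen / 32 };   // 8 lanes when VLEN is fixed at 256 bits
    uint32_t a_in[VL], b_in[VL];
    for (int i = 0; i < VL; i++) {
        a_in[i] = (uint32_t)i;
        b_in[i] = (uint32_t)(VL - i);
    }

    // Loads return the scalable type, which converts implicitly to the fixed-size typedef.
    fixed_u32 a = __riscv_vle32_v_u32m1(a_in, VL);
    fixed_u32 b = __riscv_vle32_v_u32m1(b_in, VL);

    // Unsigned "greater than" mask, the intrinsic wrapped by npyv_cmpgt_u32 above.
    vbool32_t gt = __riscv_vmsgtu_vv_u32m1_b32(a, b, VL);

    // "any": vfirst returns the index of the first set mask bit, or -1 if none is set --
    // the same idiom npyv_any_u32 / npyv_all_u32 build on.
    long first = __riscv_vfirst_m_b32(gt, VL);
    printf("any(a > b) = %d\n", first != -1);
    return 0;
}

Compiling with the assumed flags and running under qemu-riscv64 should print "any(a > b) = 1", since the low lanes of a are smaller than b but the high lanes are larger.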