From 6255055df598d8da19fbcb882112bd214ae5167d Mon Sep 17 00:00:00 2001 From: Leslie Zhai Date: Tue, 12 Dec 2023 12:48:59 +0800 Subject: [PATCH 1/2] Initial loongarch port Co-authored-by: yangwenqing Signed-off-by: Leslie Zhai Signed-off-by: yangwenqing --- CMakeLists.txt | 21 +- README.md | 4 +- cmake/archdetect.cmake | 3 + cmake/cflags-loongarch64.cmake | 19 + cmake/config.h.in | 6 + cmake/platform.cmake | 3 +- src/hs.cpp | 1 + src/hs_valid_platform.c | 2 + src/nfa/loongarch64/shufti.hpp | 75 +++ src/nfa/loongarch64/truffle.hpp | 63 ++ src/nfa/loongarch64/vermicelli.hpp | 130 ++++ src/nfa/shufti_simd.hpp | 2 + src/nfa/truffle_simd.hpp | 2 + src/nfa/vermicelli_simd.cpp | 2 + src/util/arch.h | 4 +- src/util/arch/loongarch64/bitutils.h | 214 +++++++ src/util/arch/loongarch64/cpuid_flags.c | 42 ++ src/util/arch/loongarch64/loongarch64.h | 47 ++ src/util/arch/loongarch64/match.hpp | 120 ++++ src/util/arch/loongarch64/simd_types.h | 39 ++ src/util/arch/loongarch64/simd_utils.h | 448 +++++++++++++ src/util/bitutils.h | 2 + src/util/intrinsics.h | 6 + src/util/match.hpp | 2 + src/util/simd_types.h | 2 + src/util/simd_utils.h | 2 + .../supervector/arch/loongarch64/impl.cpp | 603 ++++++++++++++++++ .../supervector/arch/loongarch64/types.hpp | 34 + src/util/supervector/supervector.hpp | 20 + src/util/target_info.cpp | 1 + unit/internal/simd_utils.cpp | 3 + 31 files changed, 1916 insertions(+), 6 deletions(-) create mode 100644 cmake/cflags-loongarch64.cmake create mode 100644 src/nfa/loongarch64/shufti.hpp create mode 100644 src/nfa/loongarch64/truffle.hpp create mode 100644 src/nfa/loongarch64/vermicelli.hpp create mode 100644 src/util/arch/loongarch64/bitutils.h create mode 100644 src/util/arch/loongarch64/cpuid_flags.c create mode 100644 src/util/arch/loongarch64/loongarch64.h create mode 100644 src/util/arch/loongarch64/match.hpp create mode 100644 src/util/arch/loongarch64/simd_types.h create mode 100644 src/util/arch/loongarch64/simd_utils.h create mode 100644 src/util/supervector/arch/loongarch64/impl.cpp create mode 100644 src/util/supervector/arch/loongarch64/types.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 30c8663e7..74997466e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.18.4) +cmake_minimum_required (VERSION 3.13.4) project (vectorscan C CXX) @@ -127,6 +127,8 @@ elseif (ARCH_ARM32 OR ARCH_AARCH64) include (${CMAKE_MODULE_PATH}/cflags-arm.cmake) elseif (ARCH_PPC64EL) include (${CMAKE_MODULE_PATH}/cflags-ppc64le.cmake) +elseif (ARCH_LOONGARCH64) + include (${CMAKE_MODULE_PATH}/cflags-loongarch64.cmake) else () message(FATAL_ERROR "Unsupported platform") endif () @@ -160,6 +162,11 @@ foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") endforeach () +if (ARCH_LOONGARCH64) + set(ARCH_C_FLAGS "-mlsx") + set(ARCH_CXX_FLAGS "-mlsx") +endif(ARCH_LOONGARCH64) + message(STATUS "ARCH_C_FLAGS : ${ARCH_C_FLAGS}") message(STATUS "ARCH_CXX_FLAGS : ${ARCH_CXX_FLAGS}") @@ -186,7 +193,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") # PCRE check, we have a fixed requirement for PCRE to use Chimera # and hscollider set(PCRE_REQUIRED_MAJOR_VERSION 8) -set(PCRE_REQUIRED_MINOR_VERSION 41) +set(PCRE_REQUIRED_MINOR_VERSION 39) set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) include (${CMAKE_MODULE_PATH}/pcre.cmake) if (NOT CORRECT_PCRE_VERSION) @@ -264,6 +271,11 @@ elseif (ARCH_PPC64EL) 
set (hs_exec_common_SRCS ${hs_exec_common_SRCS} src/util/arch/ppc64el/cpuid_flags.c) +elseif (ARCH_LOONGARCH64) +set (hs_exec_common_SRCS + ${hs_exec_common_SRCS} + src/util/arch/loongarch64/cpuid_flags.c + ) endif () set (hs_exec_SRCS @@ -427,6 +439,11 @@ set (hs_exec_SRCS ${hs_exec_SRCS} src/nfa/vermicelli_simd.cpp src/util/supervector/arch/ppc64el/impl.cpp) +elseif (ARCH_LOONGARCH64) +set (hs_exec_SRCS + ${hs_exec_SRCS} + src/nfa/vermicelli_simd.cpp + src/util/supervector/arch/loongarch64/impl.cpp) endif() if (ARCH_IA32 OR ARCH_X86_64) diff --git a/README.md b/README.md index 7f7c2f531..32f3e9715 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # About Vectorscan -A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD -and Power VSX are 100% functional. ARM SVE2 support is in ongoing with +A fork of Intel's Hyperscan, modified to run on more platforms. Currently ARM NEON/ASIMD, +Power VSX and LoongArch LSX are 100% functional. ARM SVE2 support is in ongoing with access to hardware now. More platforms will follow in the future. Further more, starting 5.4.12 there is now a [SIMDe](https://github.com/simd-everywhere/simde) port, which can be either used for platforms without official SIMD support, diff --git a/cmake/archdetect.cmake b/cmake/archdetect.cmake index 494269c29..1aa03a96d 100644 --- a/cmake/archdetect.cmake +++ b/cmake/archdetect.cmake @@ -90,6 +90,9 @@ else() elseif(ARCH_PPC64EL) set(GNUCC_ARCH power8) set(TUNE_FLAG power8) + elseif(ARCH_LOONGARCH64) + set(GNUCC_ARCH la464) + set(TUNE_FLAG generic) else() set(GNUCC_ARCH native) set(TUNE_FLAG native) diff --git a/cmake/cflags-loongarch64.cmake b/cmake/cflags-loongarch64.cmake new file mode 100644 index 000000000..1af7312f3 --- /dev/null +++ b/cmake/cflags-loongarch64.cmake @@ -0,0 +1,19 @@ + +CHECK_INCLUDE_FILE_CXX(lsxintrin.h HAVE_C_LOONGARCH64_LSXINTRIN_H) + +if (HAVE_C_LOONGARCH64_LSXINTRIN_H) + set (INTRIN_INC_H "lsxintrin.h") +else() + message (FATAL_ERROR "No intrinsics header found for LSX") +endif () + +set(CMAKE_REQUIRED_FLAGS "-mlsx") +CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> +int main() { + __m128i a = __lsx_vreplgr2vr_w(1); + (void)a; +}" HAVE_LSX) + +if (NOT HAVE_LSX) + message(FATAL_ERROR "LSX support required for LoongArch support") +endif () diff --git a/cmake/config.h.in b/cmake/config.h.in index dbd72445c..824ef0f93 100644 --- a/cmake/config.h.in +++ b/cmake/config.h.in @@ -24,6 +24,9 @@ /* "Define if building for PPC64EL" */ #cmakedefine ARCH_PPC64EL +/* "Define if building for LOONGARCH64" */ +#cmakedefine ARCH_LOONGARCH64 + /* "Define if cross compiling for AARCH64" */ #cmakedefine CROSS_COMPILE_AARCH64 @@ -81,6 +84,9 @@ /* C compiler has arm_neon.h */ #cmakedefine HAVE_C_PPC64EL_ALTIVEC_H +/* C compiler has lsxintrin.h */ +#cmakedefine HAVE_C_LOONGARCH64_LSXINTRIN_H + /* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to 0 if you don't. 
*/ #cmakedefine HAVE_DECL_PTHREAD_SETAFFINITY_NP diff --git a/cmake/platform.cmake b/cmake/platform.cmake index 30f6da92d..751d8fef1 100644 --- a/cmake/platform.cmake +++ b/cmake/platform.cmake @@ -5,7 +5,8 @@ CHECK_C_SOURCE_COMPILES("#if !(defined(__i386__) || defined(_M_IX86))\n#error no CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_A64)\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_AARCH64) CHECK_C_SOURCE_COMPILES("#if !defined(__ARM_ARCH_ISA_ARM)\n#error not 32bit\n#endif\nint main(void) { return 0; }" ARCH_ARM32) CHECK_C_SOURCE_COMPILES("#if !defined(__PPC64__) && !(defined(__LITTLE_ENDIAN__) && defined(__VSX__))\n#error not ppc64el\n#endif\nint main(void) { return 0; }" ARCH_PPC64EL) -if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL) +CHECK_C_SOURCE_COMPILES("#if !(defined(__loongarch_lp64) || defined( __loongarch64))\n#error not 64bit\n#endif\nint main(void) { return 0; }" ARCH_LOONGARCH64) +if (ARCH_X86_64 OR ARCH_AARCH64 OR ARCH_PPC64EL OR ARCH_LOONGARCH64) set(ARCH_64_BIT TRUE) else() set(ARCH_32_BIT TRUE) diff --git a/src/hs.cpp b/src/hs.cpp index 61e46148c..5c4705e50 100644 --- a/src/hs.cpp +++ b/src/hs.cpp @@ -48,6 +48,7 @@ #if defined(ARCH_IA32) || defined(ARCH_X86_64) #include "util/arch/x86/cpuid_inline.h" #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#elif defined(ARCH_LOONGARCH64) #endif #include "util/depth.h" #include "util/popcount.h" diff --git a/src/hs_valid_platform.c b/src/hs_valid_platform.c index 74a8fc1ec..7d0c2ab0a 100644 --- a/src/hs_valid_platform.c +++ b/src/hs_valid_platform.c @@ -55,5 +55,7 @@ hs_error_t HS_CDECL hs_valid_platform(void) { } #elif defined(ARCH_PPC64EL) || defined(VS_SIMDE_BACKEND) return HS_SUCCESS; +#elif defined(ARCH_LOONGARCH64) + return HS_SUCCESS; #endif } diff --git a/src/nfa/loongarch64/shufti.hpp b/src/nfa/loongarch64/shufti.hpp new file mode 100644 index 000000000..ce02a1953 --- /dev/null +++ b/src/nfa/loongarch64/shufti.hpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Shufti: character class acceleration. + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector mask_lo, SuperVector mask_hi, SuperVector chars) { + const SuperVector low4bits = SuperVector::dup_u8(0xf); + + SuperVector c_lo = chars & low4bits; + SuperVector c_hi = chars.template vshr_8_imm<4>(); + c_lo = mask_lo.template pshufb(c_lo); + c_hi = mask_hi.template pshufb(c_hi); + + return (c_lo & c_hi) > (SuperVector::Zeroes()); +} + +template +static really_inline +SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, SuperVector mask2_lo, SuperVector mask2_hi, SuperVector chars) { + + const SuperVector low4bits = SuperVector::dup_u8(0xf); + SuperVector chars_lo = chars & low4bits; + chars_lo.print8("chars_lo"); + SuperVector chars_hi = chars.template vshr_64_imm<4>() & low4bits; + chars_hi.print8("chars_hi"); + SuperVector c1_lo = mask1_lo.template pshufb(chars_lo); + c1_lo.print8("c1_lo"); + SuperVector c1_hi = mask1_hi.template pshufb(chars_hi); + c1_hi.print8("c1_hi"); + SuperVector t1 = c1_lo | c1_hi; + t1.print8("t1"); + + SuperVector c2_lo = mask2_lo.template pshufb(chars_lo); + c2_lo.print8("c2_lo"); + SuperVector c2_hi = mask2_hi.template pshufb(chars_hi); + c2_hi.print8("c2_hi"); + SuperVector t2 = c2_lo | c2_hi; + t2.print8("t2"); + t2.template vshr_128_imm<1>().print8("t2.vshr_128(1)"); + SuperVector t = t1 | (t2.template vshr_128_imm<1>()); + t.print8("t"); + + return !t.eq(SuperVector::Ones()); +} diff --git a/src/nfa/loongarch64/truffle.hpp b/src/nfa/loongarch64/truffle.hpp new file mode 100644 index 000000000..a796d0fc2 --- /dev/null +++ b/src/nfa/loongarch64/truffle.hpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Truffle: character class acceleration. + * + */ + +template +static really_inline +const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, SuperVector shuf_mask_lo_highset, SuperVector chars) { + + chars.print8("chars"); + shuf_mask_lo_highclear.print8("shuf_mask_lo_highclear"); + shuf_mask_lo_highset.print8("shuf_mask_lo_highset"); + + SuperVector highconst = SuperVector::dup_u8(0x80); + highconst.print8("highconst"); + SuperVector shuf_mask_hi = SuperVector::dup_u64(0x8040201008040201); + shuf_mask_hi.print8("shuf_mask_hi"); + + SuperVector shuf1 = shuf_mask_lo_highclear.pshufb(chars); + shuf1.print8("shuf1"); + SuperVector t1 = chars ^ highconst; + t1.print8("t1"); + SuperVector shuf2 = shuf_mask_lo_highset.pshufb(t1); + shuf2.print8("shuf2"); + SuperVector t2 = highconst.opandnot(chars.template vshr_64_imm<4>()); + t2.print8("t2"); + SuperVector shuf3 = shuf_mask_hi.pshufb(t2); + shuf3.print8("shuf3"); + SuperVector res = (shuf1 | shuf2) & shuf3; + res.print8("(shuf1 | shuf2) & shuf3"); + + return !res.eq(SuperVector::Zeroes()); +} diff --git a/src/nfa/loongarch64/vermicelli.hpp b/src/nfa/loongarch64/vermicelli.hpp new file mode 100644 index 000000000..a30f7fc08 --- /dev/null +++ b/src/nfa/loongarch64/vermicelli.hpp @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Vermicelli: single-byte and double-byte acceleration. + */ + +template +static really_inline +const u8 *vermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = !chars.eq(casemask & data); + return first_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlock(SuperVector const data, SuperVector const chars, SuperVector const casemask, u8 const *buf, u16 const len) { + + SuperVector mask = chars.eq(casemask & data); + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliBlockNeg(SuperVector const data, SuperVector const chars, SuperVector const casemask, const u8 *buf, u16 const len) { + + data.print8("data"); + chars.print8("chars"); + casemask.print8("casemask"); + SuperVector mask = !chars.eq(casemask & data); + mask.print8("mask"); + return last_zero_match_inverted(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = mask1 & (mask2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *rvermicelliDoubleBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, SuperVector const casemask, + u8 const c1, u8 const c2, u8 const casechar, u8 const *buf, u16 const len) { + + SuperVector v = casemask & data; + SuperVector mask1 = chars1.eq(v); + SuperVector mask2 = chars2.eq(v); + SuperVector mask = (mask1 << 1)& mask2; + + DEBUG_PRINTF("buf[0] = %02hhx, buf[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & casechar) == c2) && ((buf[-1] & casechar) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | (SuperVector::Ones() >> (S-1)); + } + + return last_non_zero_match(buf, mask, len); +} + +template +static really_inline +const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector const chars1, SuperVector const chars2, + SuperVector const mask1, SuperVector const mask2, + u8 const c1, u8 const c2, u8 const m1, u8 const m2, u8 const *buf, u16 const len) { + + SuperVector v1 = chars1.eq(data & mask1); + SuperVector v2 = chars2.eq(data & mask2); + SuperVector mask = v1 & (v2 >> 1); + + DEBUG_PRINTF("rv[0] = %02hhx, rv[-1] = %02hhx\n", buf[0], buf[-1]); + bool partial_match = (check_partial && ((buf[0] & m2) == c2) && ((buf[-1] & m1) == c1)); + DEBUG_PRINTF("partial = %d\n", partial_match); + if (partial_match) { + mask = mask | ((SuperVector::Ones() >> (S-1)) << (S-1)); + } + + return first_non_zero_match(buf, 
mask, len); +} + diff --git a/src/nfa/shufti_simd.hpp b/src/nfa/shufti_simd.hpp index feeb54abd..51f8f7a6b 100644 --- a/src/nfa/shufti_simd.hpp +++ b/src/nfa/shufti_simd.hpp @@ -61,6 +61,8 @@ SuperVector blockDoubleMask(SuperVector mask1_lo, SuperVector mask1_hi, #include "arm/shufti.hpp" #elif defined(ARCH_PPC64EL) #include "ppc64el/shufti.hpp" +#elif defined(ARCH_LOONGARCH64) +#include "loongarch64/shufti.hpp" #endif #endif diff --git a/src/nfa/truffle_simd.hpp b/src/nfa/truffle_simd.hpp index c1028156e..9ef34006c 100644 --- a/src/nfa/truffle_simd.hpp +++ b/src/nfa/truffle_simd.hpp @@ -54,6 +54,8 @@ const SuperVector blockSingleMask(SuperVector shuf_mask_lo_highclear, Supe #include "arm/truffle.hpp" #elif defined(ARCH_PPC64EL) #include "ppc64el/truffle.hpp" +#elif defined(ARCH_LOONGARCH64) +#include "loongarch64/truffle.hpp" #endif #endif diff --git a/src/nfa/vermicelli_simd.cpp b/src/nfa/vermicelli_simd.cpp index 67ac1dac8..48de5d0e3 100644 --- a/src/nfa/vermicelli_simd.cpp +++ b/src/nfa/vermicelli_simd.cpp @@ -80,6 +80,8 @@ const u8 *vermicelliDoubleMaskedBlock(SuperVector const data, SuperVector #include "arm/vermicelli.hpp" #elif defined(ARCH_PPC64EL) #include "ppc64el/vermicelli.hpp" +#elif defined(ARCH_LOONGARCH64) +#include "loongarch64/vermicelli.hpp" #endif #endif diff --git a/src/util/arch.h b/src/util/arch.h index 1e8d2fbd4..fb91aa53e 100644 --- a/src/util/arch.h +++ b/src/util/arch.h @@ -41,7 +41,9 @@ #include "util/arch/arm/arm.h" #elif defined(ARCH_PPC64EL) #include "util/arch/ppc64el/ppc64el.h" +#elif defined(ARCH_LOONGARCH64) +#include "util/arch/loongarch64/loongarch64.h" #endif -#endif // UTIL_ARCH_X86_H_ +#endif // UTIL_ARCH_H_ diff --git a/src/util/arch/loongarch64/bitutils.h b/src/util/arch/loongarch64/bitutils.h new file mode 100644 index 000000000..c2e346bfd --- /dev/null +++ b/src/util/arch/loongarch64/bitutils.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/** \file + * \brief Bit-twiddling primitives (ctz, compress etc) + */ + +#ifndef BITUTILS_ARCH_LOONGARCH64_H +#define BITUTILS_ARCH_LOONGARCH64_H + +#include "ue2common.h" +#include "util/popcount.h" +#include "util/arch.h" +#include "util/intrinsics.h" + +#include "util/arch/common/bitutils.h" + +static really_inline +u32 clz32_impl(u32 x) { + return clz32_impl_c(x); +} + +static really_inline +u32 clz64_impl(u64a x) { + return clz64_impl_c(x); +} + +static really_inline +u32 ctz32_impl(u32 x) { + return ctz32_impl_c(x); +} + +static really_inline +u32 ctz64_impl(u64a x) { + return ctz64_impl_c(x); +} + +static really_inline +u32 lg2_impl(u32 x) { + return lg2_impl_c(x); +} + +static really_inline +u64a lg2_64_impl(u64a x) { + return lg2_64_impl_c(x); +} + +static really_inline +u32 findAndClearLSB_32_impl(u32 *v) { + return findAndClearLSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearLSB_64_impl(u64a *v) { + return findAndClearLSB_64_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_32_impl(u32 *v) { + return findAndClearMSB_32_impl_c(v); +} + +static really_inline +u32 findAndClearMSB_64_impl(u64a *v) { + return findAndClearMSB_64_impl_c(v); +} + +static really_inline +u32 compress32_impl(u32 x, u32 m) { + return compress32_impl_c(x, m); +} + +static really_inline +u64a compress64_impl(u64a x, u64a m) { + return compress64_impl_c(x, m); +} + +static really_inline +m128 compress128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 mm = sub_2x64(zeroes128(), m); + m128 xm = and128(x, m); + xm = and128(xm, mm); + + m128 mask = not128(eq64_m128(xm, zeroes128())); + res = or128(res, and128(bb, mask)); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; +} + + +#if defined(HAVE_SVE2_BITPERM) +#include "bitutils_sve.h" +#else + +static really_inline +u32 expand32_impl(u32 x, u32 m) { + return expand32_impl_c(x, m); +} + +static really_inline +u64a expand64_impl(u64a x, u64a m) { + return expand64_impl_c(x, m); +} + +#endif // HAVE_SVE2_BITPERM + +static really_inline +m128 expand128_impl(m128 x, m128 m) { + m128 one = set1_2x64(1); + m128 bb = one; + m128 res = zeroes128(); + while (isnonzero128(m)) { + m128 xm = and128(x, bb); + m128 mm = sub_2x64(zeroes128(), m); + m128 mask = not128(eq64_m128(xm, zeroes128())); + mask = and128(mask, and128(m, mm)); + res = or128(res, mask); + m = and128(m, sub_2x64(m, one)); + bb = lshift64_m128(bb, 1); + } + return res; +} + +/* returns the first set bit after begin (if not ~0U). If no bit is set after + * begin returns ~0U + */ +static really_inline +u32 bf64_iterate_impl(u64a bitfield, u32 begin) { + if (begin != ~0U) { + /* switch off all bits at or below begin. Note: not legal to shift by + * by size of the datatype or larger. 
*/ + assert(begin <= 63); + bitfield &= ~((2ULL << begin) - 1); + } + + if (!bitfield) { + return ~0U; + } + + return ctz64_impl(bitfield); +} + +static really_inline +char bf64_set_impl(u64a *bitfield, u32 i) { + return bf64_set_impl_c(bitfield, i); +} + +static really_inline +void bf64_unset_impl(u64a *bitfield, u32 i) { + return bf64_unset_impl_c(bitfield, i); +} + +static really_inline +u32 rank_in_mask32_impl(u32 mask, u32 bit) { + return rank_in_mask32_impl_c(mask, bit); +} + +static really_inline +u32 rank_in_mask64_impl(u64a mask, u32 bit) { + return rank_in_mask64_impl_c(mask, bit); +} + +static really_inline +u32 pext32_impl(u32 x, u32 mask) { + return pext32_impl_c(x, mask); +} + +static really_inline +u64a pext64_impl(u64a x, u64a mask) { + return pext64_impl_c(x, mask); +} + +/* compilers don't reliably synthesize the 32-bit ANDN instruction here, + * so we force its generation. + */ +static really_inline +u64a andn_impl(const u32 a, const u8 *b) { + return andn_impl_c(a, b); +} + +#endif // BITUTILS_ARCH_LOONGARCH64_H diff --git a/src/util/arch/loongarch64/cpuid_flags.c b/src/util/arch/loongarch64/cpuid_flags.c new file mode 100644 index 000000000..ffc6c1f30 --- /dev/null +++ b/src/util/arch/loongarch64/cpuid_flags.c @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "util/arch/common/cpuid_flags.h" +#include "ue2common.h" +#include "hs_compile.h" // for HS_MODE_ flags +#include "util/arch.h" + +u64a cpuid_flags(void) { + return 0; +} + +u32 cpuid_tune(void) { + return HS_TUNE_FAMILY_GENERIC; +} diff --git a/src/util/arch/loongarch64/loongarch64.h b/src/util/arch/loongarch64/loongarch64.h new file mode 100644 index 000000000..3e093a998 --- /dev/null +++ b/src/util/arch/loongarch64/loongarch64.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2017-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief Per-platform architecture definitions + */ + +#ifndef UTIL_ARCH_LOONGARCH64_H_ +#define UTIL_ARCH_LOONGARCH64_H_ + +#define HAVE_LSX +#define HAVE_SIMD_128_BITS + +#if defined(HAVE_SIMD_128_BITS) +#define CHUNKSIZE 128 +#define VECTORSIZE 16 +#endif + +#endif // UTIL_ARCH_LOONGARCH64_H_ + diff --git a/src/util/arch/loongarch64/match.hpp b/src/util/arch/loongarch64/match.hpp new file mode 100644 index 000000000..78651edc9 --- /dev/null +++ b/src/util/arch/loongarch64/match.hpp @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +static really_inline m128 vpmax_loongarch(v4u32 a, v4u32 b) { + u32 result[4]; + u32 tmp1 = __lsx_vpickve2gr_wu(a, 0); + u32 tmp2 = __lsx_vpickve2gr_wu(a, 1); + result[0] = (tmp1 >= tmp2) ? tmp1 : tmp2; + tmp1 = __lsx_vpickve2gr_wu(a, 2); + tmp2 = __lsx_vpickve2gr_wu(a, 3); + result[1] = (tmp1 >= tmp2) ? tmp1 : tmp2; + tmp1 = __lsx_vpickve2gr_wu(b, 0); + tmp2 = __lsx_vpickve2gr_wu(b, 1); + result[2] = (tmp1 >= tmp2) ? tmp1 : tmp2; + tmp1 = __lsx_vpickve2gr_wu(b, 2); + tmp2 = __lsx_vpickve2gr_wu(b, 3); + result[3] = (tmp1 >= tmp2) ? tmp1 : tmp2; + v4u32 res = __lsx_vld((uint32_t *)result, 0); + return res; +} + +template <> +really_really_inline +const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + v4u32 m = mask.u.u32x4[0]; + uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + DEBUG_PRINTF("buf + pos %p\n", buf + (pos)); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + v4u32 m = mask.u.u32x4[0]; + uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + (15 - pos); + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *first_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + v4u32 m = mask.u.u32x4[0]; + uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); + DEBUG_PRINTF("z %08llx\n", z); + DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + u32 pos = ctz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + assert(pos < 16); + DEBUG_PRINTF("buf + pos %p\n", buf + pos); + return buf + pos; + } else { + return NULL; // no match + } +} + +template <> +really_really_inline +const u8 *last_zero_match_inverted<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { + v4u32 m = mask.u.u32x4[0]; + uint64_t vmax = __lsx_vpickve2gr_du(vpmax_loongarch(m, m), 0); + if (vmax != 0) { + typename SuperVector<16>::comparemask_type z = mask.comparemask(); 
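+        // Note (derived from the LSX comparemask()/mask_width() definitions in
+        // impl.cpp): each byte lane of the vector mask contributes a nibble of
+        // 4 identical bits to the 64-bit comparemask, so bit positions in z are
+        // divided by mask_width() == 4 below to recover the byte offset.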
+ DEBUG_PRINTF("buf %p z %08llx \n", buf, z); + DEBUG_PRINTF("z %08llx\n", z); + u32 pos = clz64(z) / SuperVector<16>::mask_width(); + DEBUG_PRINTF("match @ pos %u\n", pos); + return buf + (15 - pos); + } else { + return NULL; // no match + } +} + diff --git a/src/util/arch/loongarch64/simd_types.h b/src/util/arch/loongarch64/simd_types.h new file mode 100644 index 000000000..b9e18e348 --- /dev/null +++ b/src/util/arch/loongarch64/simd_types.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef SIMD_TYPES_LOONGARCH64_H +#define SIMD_TYPES_LOONGARCH64_H + +#if !defined(m128) && defined(HAVE_LSX) +typedef v4i32 m128; +#endif + +#endif /* SIMD_TYPES_LOONGARCH64_H */ + diff --git a/src/util/arch/loongarch64/simd_utils.h b/src/util/arch/loongarch64/simd_utils.h new file mode 100644 index 000000000..9a207d366 --- /dev/null +++ b/src/util/arch/loongarch64/simd_utils.h @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2015-2020, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/** \file + * \brief SIMD types and primitive operations. + */ + +#ifndef ARCH_LOONGARCH64_SIMD_UTILS_H +#define ARCH_LOONGARCH64_SIMD_UTILS_H + +#include +#include + +#include "ue2common.h" +#include "util/simd_types.h" +#include "util/unaligned.h" +#include "util/intrinsics.h" + +#include // for memcpy + +static really_inline m128 ones128(void) { + return __lsx_vreplgr2vr_b(0xFF); +} + +static really_inline m128 zeroes128(void) { + return __lsx_vreplgr2vr_w(0); +} + +/** \brief Bitwise not for m128*/ +static really_inline m128 not128(m128 a) { + return __lsx_vxor_v(a, ones128()); +} + +/** \brief Return 1 if a and b are different otherwise 0 */ +static really_inline int diff128(m128 a, m128 b) { + uint64_t res = __lsx_vpickve2gr_du(__lsx_vsrlni_b_h(zeroes128(), __lsx_vseq_w(a, b), 4), 0); + return (~0ull != res); +} + +static really_inline int isnonzero128(m128 a) { + return diff128(a, zeroes128()); +} + +/** + * "Rich" version of diff128(). Takes two vectors a and b and returns a 4-bit + * mask indicating which 32-bit words contain differences. + */ +static really_inline u32 diffrich128(m128 a, m128 b) { + static const v4u32 movemask = { 1, 2, 4, 8 }; + m128 tmp = __lsx_vand_v(not128(__lsx_vseq_w(a, b)), movemask); + return __lsx_vpickve2gr_wu(tmp, 0) + __lsx_vpickve2gr_wu(tmp, 1) + + __lsx_vpickve2gr_wu(tmp, 2) + __lsx_vpickve2gr_wu(tmp, 3); +} + +/** + * "Rich" version of diff128(), 64-bit variant. Takes two vectors a and b and + * returns a 4-bit mask indicating which 64-bit words contain differences. 
+ */ +static really_inline u32 diffrich64_128(m128 a, m128 b) { + static const v2u64 movemask = { 1, 4 }; + m128 tmp = __lsx_vand_v(not128(__lsx_vseq_d(a, b)), movemask); + return __lsx_vpickve2gr_du(tmp, 0) + __lsx_vpickve2gr_du(tmp, 1); +} + +static really_really_inline +m128 add_2x64(m128 a, m128 b) { + return __lsx_vadd_d(a, b); +} + +static really_really_inline +m128 sub_2x64(m128 a, m128 b) { + return __lsx_vsub_d(a, b); +} + +static really_inline +m128 lshift_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return __lsx_vslli_w(a, b); + } +#endif + v4i32_w shift_indices = __lsx_vreplgr2vr_w(b); + return __lsx_vsll_w(a, shift_indices); +} + +static really_really_inline +m128 rshift_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return __lsx_vsrli_w(a, b); + } +#endif + v4i32 shift_indices = __lsx_vreplgr2vr_w(b); + return __lsx_vsrl_w(a, shift_indices); +} + +static really_really_inline +m128 lshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return __lsx_vslli_d(a, b); + } +#endif + v2i64 shift_indices = __lsx_vreplgr2vr_d(b); + return __lsx_vsll_d(a, shift_indices); +} + +static really_really_inline +m128 rshift64_m128(m128 a, unsigned b) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(b)) { + return __lsx_vsrl_d(a, b); + } +#endif + v2i64 shift_indices = __lsx_vreplgr2vr_d(b); + return __lsx_vsrl_d(a, shift_indices); +} + +static really_inline m128 eq128(m128 a, m128 b) { + return __lsx_vseq_b(a, b); +} + +static really_inline m128 eq64_m128(m128 a, m128 b) { + return __lsx_vseq_d(a, b); +} + +static really_inline u32 movemask128(m128 a) { + v16u8 input = (v16u8) a; + v8u16 high_bits = (v8u16) __lsx_vsrli_b(input, 7); + v4u32 paired16 = (v4u32) __lsx_vadd_h(high_bits, __lsx_vsrli_h(high_bits, 7)); + v2u64 paired32 = (v2u64) __lsx_vadd_w(paired16, __lsx_vsrli_w(paired16, 14)); + v16u8 paired64 = (v16u8) __lsx_vadd_d(paired32, __lsx_vsrli_d(paired32, 28)); + return __lsx_vpickve2gr_bu(paired64, 0) | ((int) __lsx_vpickve2gr_bu(paired64, 8) << 8); +} + +static really_inline m128 set1_16x8(u8 c) { + return __lsx_vreplgr2vr_b(c); +} + +static really_inline m128 set1_4x32(u32 c) { + return __lsx_vreplgr2vr_w(c); +} + +static really_inline m128 set1_2x64(u64a c) { + return __lsx_vreplgr2vr_d(c); +} + +static really_inline u32 movd(const m128 in) { + return __lsx_vpickve2gr_wu(in, 0); +} + +static really_inline u64a movq(const m128 in) { + return __lsx_vpickve2gr_du(in, 0); +} + +/* another form of movq */ +static really_inline +m128 load_m128_from_u64a(const u64a *p) { + m128 tmp = zeroes128(); + return __lsx_vinsgr2vr_d(tmp, *p, 0); +} + +static really_inline u32 extract32from128(const m128 in, unsigned imm) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(imm)) { + return __lsx_vpickve2gr_wu(in, imm); + } +#endif + switch (imm) { + case 0: + return __lsx_vpickve2gr_wu(in, 0); + break; + case 1: + return __lsx_vpickve2gr_wu(in, 1); + break; + case 2: + return __lsx_vpickve2gr_wu(in, 2); + break; + case 3: + return __lsx_vpickve2gr_wu(in, 3); + break; + default: + return 0; + break; + } +} + +static really_inline u64a extract64from128(const m128 in, unsigned imm) { +#if defined(HAVE__BUILTIN_CONSTANT_P) + if (__builtin_constant_p(imm)) { + return __lsx_vpickve2gr_du(in, imm); + } +#endif + switch (imm) { + case 0: + return __lsx_vpickve2gr_du(in, 0); + break; + case 1: + return 
__lsx_vpickve2gr_du(in, 1); + break; + default: + return 0; + break; + } +} + +static really_inline m128 low64from128(const m128 in) { + m128 ret = zeroes128(); + __lsx_vinsgr2vr_d(ret, __lsx_vpickve2gr_d(in, 0), 0); + return ret; +} + +static really_inline m128 high64from128(const m128 in) { + m128 ret = zeroes128(); + __lsx_vinsgr2vr_d(ret, __lsx_vpickve2gr_d(in, 1), 0); + return ret; +} + +static really_inline m128 add128(m128 a, m128 b) { + return __lsx_vadd_q(a, b); +} + +static really_inline m128 and128(m128 a, m128 b) { + return __lsx_vand_v(a, b); +} + +static really_inline m128 xor128(m128 a, m128 b) { + return __lsx_vxor_v(a, b); +} + +static really_inline m128 or128(m128 a, m128 b) { + return __lsx_vor_v(a, b); +} + +static really_inline m128 andnot128(m128 a, m128 b) { + return __lsx_vandn_v(a, b); +} + +// aligned load +static really_inline m128 load128(const void *ptr) { + assert(ISALIGNED_N(ptr, alignof(m128))); + return __lsx_vld((const int32_t *)ptr, 0); +} + +// aligned store +static really_inline void store128(void *ptr, m128 a) { + assert(ISALIGNED_N(ptr, alignof(m128))); + __lsx_vst(a, (int32_t *)ptr, 0); +} + +// unaligned load +static really_inline m128 loadu128(const void *ptr) { + return __lsx_vld((const int32_t *)ptr, 0); +} + +// unaligned store +static really_inline void storeu128(void *ptr, m128 a) { + __lsx_vst(a, (int32_t *)ptr, 0); +} + +// packed unaligned store of first N bytes +static really_inline +void storebytes128(void *ptr, m128 a, unsigned int n) { + assert(n <= sizeof(a)); + memcpy(ptr, &a, n); +} + +// packed unaligned load of first N bytes, pad with zero +static really_inline +m128 loadbytes128(const void *ptr, unsigned int n) { + m128 a = zeroes128(); + assert(n <= sizeof(a)); + memcpy(&a, ptr, n); + return a; +} + +static really_inline m128 case_algin_vectors(m128 a,m128 b,int offset) { + u8 index_shuf[16]; + for(int i = 0; i < 16; i++) { + index_shuf[i] = (uint8_t)offset; + offset += 1; + } + v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0); + return __lsx_vshuf_b(b, a, index); +} +static really_really_inline +m128 palignr_imm(m128 r, m128 l, int offset) { + switch (offset) { + case 0: return l; break; + case 1: return case_algin_vectors(l, r, 1); break; + case 2: return case_algin_vectors(l, r, 2); break; + case 3: return case_algin_vectors(l, r, 3); break; + case 4: return case_algin_vectors(l, r, 4); break; + case 5: return case_algin_vectors(l, r, 5); break; + case 6: return case_algin_vectors(l, r, 6); break; + case 7: return case_algin_vectors(l, r, 7); break; + case 8: return case_algin_vectors(l, r, 8); break; + case 9: return case_algin_vectors(l, r, 9); break; + case 10: return case_algin_vectors(l, r, 10); break; + case 11: return case_algin_vectors(l, r, 11); break; + case 12: return case_algin_vectors(l, r, 12); break; + case 13: return case_algin_vectors(l, r, 13); break; + case 14: return case_algin_vectors(l, r, 14); break; + case 15: return case_algin_vectors(l, r, 15); break; + case 16: return r; break; + default: + return zeroes128(); + break; + } +} + +static really_really_inline +m128 palignr(m128 r, m128 l, int offset) { + +#if defined(HAVE__BUILTIN_CONSTANT_P) + u8 index_shuf[16]; + for (int i = 0; i < 16; i++) { + index_shuf[i] = (uint8_t)offset; + offset += 1; + } + v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0); + if (__builtin_constant_p(index)) { + return __lsx_vshuf_b(r, l, index); + } +#endif + return palignr_imm(r, l, offset); +} +//#undef CASE_ALIGN_VECTORS + +static really_really_inline +m128 
rshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } + return palignr(zeroes128(), a, b); +} + +static really_really_inline +m128 lshiftbyte_m128(m128 a, unsigned b) { + if (b == 0) { + return a; + } + return palignr(a, zeroes128(), 16 - b); +} + +static really_inline +m128 variable_byte_shift_m128(m128 in, s32 amount) { + assert(amount >= -16 && amount <= 16); + if (amount < 0) { + return palignr_imm(zeroes128(), in, -amount); + } else { + return palignr_imm(in, zeroes128(), 16 - amount); + } +} + +static really_inline +m128 mask1bit128(unsigned int n) { + assert(n < sizeof(m128) * 8); + static m128 onebit = { 1, 0 }; + m128 mask = lshiftbyte_m128( onebit, n / 8 ); + return lshift64_m128( mask, n % 8 ); +} + +// switches on bit N in the given vector. +static really_inline +void setbit128(m128 *ptr, unsigned int n) { + *ptr = or128(mask1bit128(n), *ptr); +} + +// switches off bit N in the given vector. +static really_inline +void clearbit128(m128 *ptr, unsigned int n) { + *ptr = andnot128(mask1bit128(n), *ptr); +} + +// tests bit N in the given vector. +static really_inline +char testbit128(m128 val, unsigned int n) { + const m128 mask = mask1bit128(n); + return isnonzero128(and128(mask, val)); +} + +static really_inline +m128 pshufb_m128(m128 a, m128 b) { + v16u8 tmp = __lsx_vand_v((v16u8)b,__lsx_vreplgr2vr_b(0x8f)); + return __lsx_vshuf_b(zeroes128(),a, tmp); +} + +static really_inline +m128 max_u8_m128(m128 a, m128 b) { + return __lsx_vmax_bu(a, b); +} + +static really_inline +m128 min_u8_m128(m128 a, m128 b) { + return __lsx_vmin_bu(a, b); +} + +static really_inline +m128 sadd_u8_m128(m128 a, m128 b) { + return __lsx_vsadd_bu(a, b); +} + +static really_inline +m128 sub_u8_m128(m128 a, m128 b) { + return __lsx_vssub_bu(a, b); +} + +static really_inline +m128 set4x32(u32 x3, u32 x2, u32 x1, u32 x0) { + uint32_t ALIGN_ATTR(16) data[4] = { x0, x1, x2, x3 }; + return __lsx_vld((uint32_t *) data, 0); +} + +static really_inline +m128 set2x64(u64a hi, u64a lo) { + uint64_t ALIGN_ATTR(16) data[2] = { lo, hi }; + return __lsx_vld((uint64_t *) data, 0); +} + +#endif // ARCH_LOONGARCH64_SIMD_UTILS_H diff --git a/src/util/bitutils.h b/src/util/bitutils.h index c67d5a85d..eed40aa2a 100644 --- a/src/util/bitutils.h +++ b/src/util/bitutils.h @@ -52,6 +52,8 @@ #include "util/arch/arm/bitutils.h" #elif defined(ARCH_PPC64EL) #include "util/arch/ppc64el/bitutils.h" +#elif defined(ARCH_LOONGARCH64) +#include "util/arch/loongarch64/bitutils.h" #endif #else #include "util/arch/common/bitutils.h" diff --git a/src/util/intrinsics.h b/src/util/intrinsics.h index 64f9e9bad..0c12f31c6 100644 --- a/src/util/intrinsics.h +++ b/src/util/intrinsics.h @@ -53,6 +53,10 @@ # define USE_PPC64EL_ALTIVEC_H #endif +#if defined(HAVE_C_LOONGARCH64_LSXINTRIN_H) +# define USE_LOONGARCH64_LSXINTRIN_H +#endif + #ifdef __cplusplus # if defined(HAVE_CXX_INTRIN_H) # define USE_INTRIN_H @@ -74,6 +78,8 @@ # endif #elif defined(USE_PPC64EL_ALTIVEC_H) #include +#elif defined(USE_LOONGARCH64_LSXINTRIN_H) +#include #endif #endif // INTRINSICS_H diff --git a/src/util/match.hpp b/src/util/match.hpp index 6567b2129..fd0a980be 100644 --- a/src/util/match.hpp +++ b/src/util/match.hpp @@ -58,6 +58,8 @@ const u8 *last_zero_match_inverted(const u8 *buf, SuperVector v, u16 len = S) #include "util/arch/arm/match.hpp" #elif defined(ARCH_PPC64EL) #include "util/arch/ppc64el/match.hpp" +#elif defined(ARCH_LOONGARCH64) +#include "util/arch/loongarch64/match.hpp" #endif #endif diff --git a/src/util/simd_types.h b/src/util/simd_types.h 
index e393d081a..b665b6795 100644 --- a/src/util/simd_types.h +++ b/src/util/simd_types.h @@ -50,6 +50,8 @@ typedef simde__m128i m128; #include "util/arch/arm/simd_types.h" #elif defined(ARCH_PPC64EL) #include "util/arch/ppc64el/simd_types.h" +#elif defined(ARCH_LOONGARCH64) +#include "util/arch/loongarch64/simd_types.h" #endif diff --git a/src/util/simd_utils.h b/src/util/simd_utils.h index 01c309b1b..401d08658 100644 --- a/src/util/simd_utils.h +++ b/src/util/simd_utils.h @@ -71,6 +71,8 @@ extern const char vbs_mask_data[]; #include "util/arch/arm/simd_utils.h" #elif defined(ARCH_PPC64EL) #include "util/arch/ppc64el/simd_utils.h" +#elif defined(ARCH_LOONGARCH64) +#include "util/arch/loongarch64/simd_utils.h" #endif #endif diff --git a/src/util/supervector/arch/loongarch64/impl.cpp b/src/util/supervector/arch/loongarch64/impl.cpp new file mode 100644 index 000000000..3218d5e61 --- /dev/null +++ b/src/util/supervector/arch/loongarch64/impl.cpp @@ -0,0 +1,603 @@ +/* + * Copyright (c) 2015-2017, Intel Corporation + * Copyright (c) 2020-2021, VectorCamp PC + * Copyright (c) 2023, Loongson Technology + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef SIMD_IMPL_HPP +#define SIMD_IMPL_HPP + +#include + +#include "ue2common.h" +#include "util/supervector/supervector.hpp" + +// 128-bit LSX implementation + +template<> +really_inline SuperVector<16>::SuperVector(typename base_type::type const v) +{ + u.v128[0] = v; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v8i16_h other) +{ + u.s8x16[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v8u16_h other) +{ + u.u8x16[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v16i8_b other) +{ + u.s16x8[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v16u8_b other) +{ + u.u16x8[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v4i32_w other) +{ + u.s32x4[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v4u32_w other) +{ + u.u32x4[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v2i64_d other) +{ + u.s64x2[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(v2u64_d other) +{ + u.u64x2[0] = other; +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int8_t const other) +{ + u.s8x16[0] = __lsx_vreplgr2vr_b(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint8_t const other) +{ + u.u8x16[0] = (v16u8)__lsx_vreplgr2vr_b(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int16_t const other) +{ + u.s16x8[0] = __lsx_vreplgr2vr_h(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint16_t const other) +{ + u.u16x8[0] = (v8u16)__lsx_vreplgr2vr_h(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int32_t const other) +{ + u.s32x4[0] = __lsx_vreplgr2vr_w(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint32_t const other) +{ + u.u32x4[0] = (v4u32)__lsx_vreplgr2vr_w(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(int64_t const other) +{ + u.s64x2[0] = __lsx_vreplgr2vr_d(other); +} + +template<> +template<> +really_inline SuperVector<16>::SuperVector(uint64_t const other) +{ + u.u64x2[0] = (v2u64)__lsx_vreplgr2vr_d(other); +} + +// Constants +template<> +really_inline SuperVector<16> SuperVector<16>::Ones(void) +{ + return {__lsx_vreplgr2vr_b(0xFF)}; +} + +template<> +really_inline SuperVector<16> SuperVector<16>::Zeroes(void) +{ + return {__lsx_vreplgr2vr_b(0)}; +} + +// Methods + +template <> +really_inline void SuperVector<16>::operator=(SuperVector<16> const &other) +{ + u.v128[0] = other.u.v128[0]; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator&(SuperVector<16> const &b) const +{ + return {__lsx_vand_v(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator|(SuperVector<16> const &b) const +{ + return {__lsx_vor_v(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator^(SuperVector<16> const &b) const +{ + return {__lsx_vxor_v(u.u8x16[0], b.u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::operator!() const +{ + return {__lsx_vnor_v(u.u8x16[0], u.u8x16[0])}; +} + +template <> +really_inline SuperVector<16> SuperVector<16>::opandnot(SuperVector<16> const &b) const +{ + return {__lsx_vand_v(__lsx_vnor_v(u.u8x16[0], u.u8x16[0]), b.u.u8x16[0])}; +} + +template <> 
+really_inline SuperVector<16> SuperVector<16>::operator==(SuperVector<16> const &b) const
+{
+    return {__lsx_vseq_b(u.u8x16[0], b.u.u8x16[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator!=(SuperVector<16> const &b) const
+{
+    return !(*this == b);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>(SuperVector<16> const &b) const
+{
+    return {__lsx_vslt_b(b.u.s8x16[0], u.s8x16[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>=(SuperVector<16> const &b) const
+{
+    return {__lsx_vsle_bu(b.u.u8x16[0], u.u8x16[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<(SuperVector<16> const &b) const
+{
+    return {__lsx_vslt_b(u.s8x16[0], b.u.s8x16[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<=(SuperVector<16> const &b) const
+{
+    return {__lsx_vsle_b(u.s8x16[0], b.u.s8x16[0])};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::eq(SuperVector<16> const &b) const
+{
+    return (*this == b);
+}
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::comparemask(void) const {
+    return static_cast<typename SuperVector<16>::comparemask_type>(
+        __lsx_vpickve2gr_du(__lsx_vsrlni_b_h(__lsx_vreplgr2vr_w(0), u.u16x8[0], 4), 0));
+}
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::eqmask(SuperVector<16> const b) const {
+    return eq(b).comparemask();
+}
+
+template <> really_inline u32 SuperVector<16>::mask_width() { return 4; }
+
+template <>
+really_inline typename SuperVector<16>::comparemask_type
+SuperVector<16>::iteration_mask(
+    typename SuperVector<16>::comparemask_type mask) {
+    return mask & 0x1111111111111111ull;
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_8_imm() const
+{
+    return {__lsx_vslli_b(u.u8x16[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_16_imm() const
+{
+    return {__lsx_vslli_h(u.u16x8[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_32_imm() const
+{
+    return {__lsx_vslli_w(u.u32x4[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_64_imm() const
+{
+    return {__lsx_vslli_d(u.u64x2[0], N)};
+}
+
+static really_inline m128 create_index(int offset) {
+    u8 index_shuf[16];
+    for (int i = 0; i < 16; i++) {
+        index_shuf[i] = (uint8_t)offset;
+        offset += 1;
+    }
+    v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0);
+    return index;
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_128_imm() const
+{
+    return {__lsx_vshuf_b(u.u8x16[0], __lsx_vreplgr2vr_b(0), create_index(16 - N))};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshl_imm() const
+{
+    return vshl_128_imm<N>();
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_8_imm() const
+{
+    return {__lsx_vsrli_b(u.u8x16[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_16_imm() const
+{
+    return {__lsx_vsrli_h(u.u16x8[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_32_imm() const
+{
+    return {__lsx_vsrli_w(u.u32x4[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_64_imm() const
+{
+    return {__lsx_vsrli_d(u.u64x2[0], N)};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_128_imm() const
+{
+    return {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), u.u8x16[0], create_index(N))};
+}
+
+template <>
+template <uint8_t N>
+really_inline SuperVector<16> SuperVector<16>::vshr_imm() const
+{
+    return vshr_128_imm<N>();
+}
+
+#if !defined(HS_OPTIMIZE)
+template SuperVector<16> SuperVector<16>::vshl_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshl_128_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_8_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_16_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_64_imm<4>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<1>() const;
+template SuperVector<16> SuperVector<16>::vshr_128_imm<4>() const;
+#endif
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_8 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 8) return Zeroes();
+    v16i8 shift_indices = __lsx_vreplgr2vr_b(N);
+    return { __lsx_vsll_b(u.s8x16[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
+    v8i16 shift_indices = __lsx_vreplgr2vr_h(N);
+    return { __lsx_vsll_h(u.s16x8[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    v4i32 shift_indices = __lsx_vreplgr2vr_w(N);
+    return { __lsx_vsll_w(u.s32x4[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
+    v2i64 shift_indices = __lsx_vreplgr2vr_d(N);
+    return { __lsx_vsll_d(u.s64x2[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl_128(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    u8 index_shuf[16];
+    for (int i = 0; i < 16; i++) {
+        index_shuf[i] = (uint8_t)(16 - N + i); // same indices as create_index(16 - N)
+    }
+    v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0);
+    if (__builtin_constant_p(index)) {
+        return {__lsx_vshuf_b(u.u8x16[0], __lsx_vreplgr2vr_b(0), index)};
+    }
+#endif
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {__lsx_vshuf_b(v->u.u8x16[0], __lsx_vreplgr2vr_b(0), create_index(16 - n))}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshl(uint8_t const N) const
+{
+    return vshl_128(N);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_8 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 8) return Zeroes();
+    v16i8 shift_indices = __lsx_vreplgr2vr_b(N);
+    return { __lsx_vsrl_b(u.s8x16[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_16 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
+    v8i16 shift_indices = __lsx_vreplgr2vr_h(N);
+    return { __lsx_vsrl_h(u.s16x8[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_32 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 32) return Zeroes();
+    v4i32 shift_indices = __lsx_vreplgr2vr_w(N);
+    return { __lsx_vsrl_w(u.s32x4[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_64 (uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 64) return Zeroes();
+    v2i64 shift_indices = __lsx_vreplgr2vr_d(N);
+    return { __lsx_vsrl_d(u.s64x2[0], shift_indices) };
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr_128(uint8_t const N) const
+{
+    if (N == 0) return *this;
+    if (N == 16) return Zeroes();
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    u8 index_shuf[16];
+    for (int i = 0; i < 16; i++) {
+        index_shuf[i] = (uint8_t)(N + i); // same indices as create_index(N)
+    }
+    v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0);
+    if (__builtin_constant_p(index)) {
+        return {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), u.u8x16[0], index)};
+    }
+#endif
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (N == n) result = {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), v->u.u8x16[0], create_index(n))}; });
+    return result;
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::vshr(uint8_t const N) const
+{
+    return vshr_128(N);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator>>(uint8_t const N) const
+{
+    return vshr_128(N);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::operator<<(uint8_t const N) const
+{
+    return vshl_128(N);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshr(uint8_t const N)
+{
+    return Ones().vshr_128(N);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::Ones_vshl(uint8_t const N)
+{
+    return Ones().vshl_128(N);
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu(void const *ptr)
+{
+    return {__lsx_vld((const int32_t *)ptr, 0)};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::load(void const *ptr)
+{
+    assert(ISALIGNED_N(ptr, alignof(SuperVector::size)));
+    ptr = vectorscan_assume_aligned(ptr, SuperVector::size);
+    return {__lsx_vld((const int32_t *)ptr, 0)};
+}
+
+template <>
+really_inline SuperVector<16> SuperVector<16>::loadu_maskz(void const *ptr, uint8_t const len)
+{
+    SuperVector mask = Ones_vshr(16 - len);
+    SuperVector<16> v = loadu(ptr);
+    return mask & v;
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::alignr(SuperVector<16> &other, int8_t offset)
+{
+    if (offset == 0) return other;
+    if (offset == 16) return *this;
+#if defined(HAVE__BUILTIN_CONSTANT_P)
+    u8 index_shuf[16];
+    for (int i = 0; i < 16; i++) {
+        index_shuf[i] = (uint8_t)(offset + i); // same indices as create_index(offset)
+    }
+    v16u8 index = __lsx_vld((uint8_t *)index_shuf, 0);
+    if (__builtin_constant_p(index)) {
+        return {__lsx_vshuf_b(u.u8x16[0], other.u.u8x16[0], index)};
+    }
+#endif
+    SuperVector result;
+    Unroller<1, 16>::iterator([&,v=this](auto const i) { constexpr uint8_t n = i.value; if (offset == n) result = {__lsx_vshuf_b(v->u.u8x16[0], other.u.u8x16[0], create_index(n))}; });
+    return result;
+}
+
+template<>
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<false>(SuperVector<16> b)
+{
+    return {__lsx_vshuf_b(__lsx_vreplgr2vr_b(0), u.u8x16[0], b.u.u8x16[0])};
+}
+
+template<>
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb<true>(SuperVector<16> b)
+{
+    /* Intel PSHUFB: if bit 0x80 of a control byte is set, the result byte is zero;
+       otherwise the control byte selects lane (control & 0xf). With LSX vshuf.b, a
+       control byte >= 16 yields zero, otherwise it selects that lane. btranslated
+       rewrites the Intel-style control bytes into the LSX convention. */
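+    /* Illustration of the translation (example values, not from the patch): an Intel
+       control byte of 0x80 must give zero; 0x80 & 0x8f = 0x80, which is >= 16, so the
+       LSX shuffle gives zero as well. A control byte of 0x17 selects lane 7 on Intel
+       (0x17 & 0xf); 0x17 & 0x8f = 0x07, which is < 16, so LSX also selects lane 7. */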
+    SuperVector<16> btranslated = b & SuperVector<16>::dup_s8(0x8f);
+    return pshufb<false>(btranslated);
+}
+
+template<>
+really_inline SuperVector<16> SuperVector<16>::pshufb_maskz(SuperVector<16> b, uint8_t const len)
+{
+    SuperVector mask = Ones_vshr(16 - len);
+    return mask & pshufb(b);
+}
+
+#endif // SIMD_IMPL_HPP
diff --git a/src/util/supervector/arch/loongarch64/types.hpp b/src/util/supervector/arch/loongarch64/types.hpp
new file mode 100644
index 000000000..7e7210f1b
--- /dev/null
+++ b/src/util/supervector/arch/loongarch64/types.hpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2015-2017, Intel Corporation
+ * Copyright (c) 2020-2021, VectorCamp PC
+ * Copyright (c) 2023, Loongson Technology
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *  * Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ *  * Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *  * Neither the name of Intel Corporation nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +#if !defined(m128) && defined(HAVE_LSX) +typedef v4i32 m128; +#endif + diff --git a/src/util/supervector/supervector.hpp b/src/util/supervector/supervector.hpp index 253907fa3..b91db812b 100644 --- a/src/util/supervector/supervector.hpp +++ b/src/util/supervector/supervector.hpp @@ -43,6 +43,8 @@ #include "util/supervector/arch/arm/types.hpp" #elif defined(ARCH_PPC64EL) #include "util/supervector/arch/ppc64el/types.hpp" +#elif defined(ARCH_LOONGARCH64) +#include "util/supervector/arch/loongarch64/types.hpp" #endif #endif // VS_SIMDE_BACKEND @@ -66,6 +68,11 @@ using Z_TYPE = u64a; #define Z_BITS 64 #define Z_POSSHIFT 2 #define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l))) +#elif defined(ARCH_LOONGARCH64) +using Z_TYPE = u64a; +#define Z_BITS 64 +#define Z_POSSHIFT 2 +#define DOUBLE_LOAD_MASK(l) ((~0ULL) >> (Z_BITS - (l))) #else using Z_TYPE = u32; #define Z_BITS 32 @@ -190,6 +197,17 @@ class SuperVector : public BaseVector int8x16_t ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; #endif +#if defined(ARCH_LOONGARCH64) + v2u64 ALIGN_ATTR(BaseVector<16>::size) u64x2[SIZE / BaseVector<16>::size]; + v2i64 ALIGN_ATTR(BaseVector<16>::size) s64x2[SIZE / BaseVector<16>::size]; + v4u32 ALIGN_ATTR(BaseVector<16>::size) u32x4[SIZE / BaseVector<16>::size]; + v4i32 ALIGN_ATTR(BaseVector<16>::size) s32x4[SIZE / BaseVector<16>::size]; + v8u16 ALIGN_ATTR(BaseVector<16>::size) u16x8[SIZE / BaseVector<16>::size]; + v8i16 ALIGN_ATTR(BaseVector<16>::size) s16x8[SIZE / BaseVector<16>::size]; + v16u8 ALIGN_ATTR(BaseVector<16>::size) u8x16[SIZE / BaseVector<16>::size]; + v16i8 ALIGN_ATTR(BaseVector<16>::size) s8x16[SIZE / BaseVector<16>::size]; +#endif + uint64_t u64[SIZE / sizeof(uint64_t)]; int64_t s64[SIZE / sizeof(int64_t)]; uint32_t u32[SIZE / sizeof(uint32_t)]; @@ -395,6 +413,8 @@ struct Unroller #include "util/supervector/arch/arm/impl.cpp" #elif defined(ARCH_PPC64EL) #include "util/supervector/arch/ppc64el/impl.cpp" +#elif defined(ARCH_LOONGARCH64) +#include "util/supervector/arch/loongarch64/impl.cpp" #endif #endif #endif diff --git a/src/util/target_info.cpp b/src/util/target_info.cpp index 9bd343426..17e6ac26f 100644 --- a/src/util/target_info.cpp +++ b/src/util/target_info.cpp @@ -32,6 +32,7 @@ #include "util/arch/common/cpuid_flags.h" #if defined(ARCH_IA32) || defined(ARCH_X86_64) #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) +#elif defined(ARCH_LOONGARCH64) #endif namespace ue2 { diff --git a/unit/internal/simd_utils.cpp b/unit/internal/simd_utils.cpp index 272d5456d..f15fbf95a 100644 --- a/unit/internal/simd_utils.cpp +++ b/unit/internal/simd_utils.cpp @@ -673,6 +673,9 @@ TEST(SimdUtilsTest, movq) { #elif defined(ARCH_ARM32) || defined(ARCH_AARCH64) int64x2_t a = { 0x123456789abcdefLL, ~0LL }; simd = vreinterpretq_s32_s64(a); +#elif defined(ARCH_LOONGARCH64) + v2i64 a = { 0x123456789abcdefLL, ~0LL }; + simd = (m128) a; #elif defined(ARCH_PPC64EL) #if defined(__clang__) && (__clang_major__ >= 15) #pragma clang diagnostic push From 5ae6ca72750fb29c2a9a1db7f2ca8e86d6d7d3e6 Mon Sep 17 00:00:00 2001 From: Leslie Zhai Date: Thu, 14 Dec 2023 09:38:03 +0800 Subject: [PATCH 2/2] Revert cmake_minimum_required and PCRE_REQUIRED_MINOR_VERSION change. Move ARCH_C_FLAGS and ARCH_CXX_FLAGS to LoongArch respective cmake file. Move vpmax_loongarch to loongarch64/simd_utils.h. 
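For reference, the vpmax_loongarch helper relocated by this patch stands in for a NEON-style
pairwise unsigned 32-bit maximum (as vpmaxq_u32 does on AArch64): it extracts the lanes with
__lsx_vpickve2gr_wu, reduces adjacent pairs in scalar code, and reloads the result with
__lsx_vld. A minimal scalar model of the intended lane layout is sketched below; the name
pairwise_max_u32 is purely illustrative and is not part of the patch.

    #include <algorithm>
    #include <array>
    #include <cstdint>

    // Scalar reference for vpmax_loongarch's lane layout:
    // out = { max(a0,a1), max(a2,a3), max(b0,b1), max(b2,b3) }.
    static std::array<uint32_t, 4> pairwise_max_u32(const std::array<uint32_t, 4> &a,
                                                    const std::array<uint32_t, 4> &b) {
        return {std::max(a[0], a[1]), std::max(a[2], a[3]),
                std::max(b[0], b[1]), std::max(b[2], b[3])};
    }

Any vector implementation should agree with this model lane for lane; the simd_utils.h hunk
below realises it with four scalar comparisons and a single vector reload.
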
--- CMakeLists.txt | 9 ++------- cmake/cflags-loongarch64.cmake | 6 ++++-- src/util/arch/loongarch64/match.hpp | 18 ------------------ src/util/arch/loongarch64/simd_utils.h | 18 ++++++++++++++++++ 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 74997466e..85f96b53b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.13.4) +cmake_minimum_required (VERSION 3.18.4) project (vectorscan C CXX) @@ -162,11 +162,6 @@ foreach (CONFIG ${CMAKE_BUILD_TYPE} ${CMAKE_CONFIGURATION_TYPES}) string(REGEX REPLACE "-O[^ ]*" "" CMAKE_CXX_FLAGS_${CONFIG} "${CMAKE_CXX_FLAGS_${CONFIG}}") endforeach () -if (ARCH_LOONGARCH64) - set(ARCH_C_FLAGS "-mlsx") - set(ARCH_CXX_FLAGS "-mlsx") -endif(ARCH_LOONGARCH64) - message(STATUS "ARCH_C_FLAGS : ${ARCH_C_FLAGS}") message(STATUS "ARCH_CXX_FLAGS : ${ARCH_CXX_FLAGS}") @@ -193,7 +188,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${ARCH_CXX_FLAGS}") # PCRE check, we have a fixed requirement for PCRE to use Chimera # and hscollider set(PCRE_REQUIRED_MAJOR_VERSION 8) -set(PCRE_REQUIRED_MINOR_VERSION 39) +set(PCRE_REQUIRED_MINOR_VERSION 41) set(PCRE_REQUIRED_VERSION ${PCRE_REQUIRED_MAJOR_VERSION}.${PCRE_REQUIRED_MINOR_VERSION}) include (${CMAKE_MODULE_PATH}/pcre.cmake) if (NOT CORRECT_PCRE_VERSION) diff --git a/cmake/cflags-loongarch64.cmake b/cmake/cflags-loongarch64.cmake index 1af7312f3..9c26c312d 100644 --- a/cmake/cflags-loongarch64.cmake +++ b/cmake/cflags-loongarch64.cmake @@ -1,4 +1,3 @@ - CHECK_INCLUDE_FILE_CXX(lsxintrin.h HAVE_C_LOONGARCH64_LSXINTRIN_H) if (HAVE_C_LOONGARCH64_LSXINTRIN_H) @@ -7,7 +6,10 @@ else() message (FATAL_ERROR "No intrinsics header found for LSX") endif () -set(CMAKE_REQUIRED_FLAGS "-mlsx") +set(ARCH_C_FLAGS "-mlsx") +set(ARCH_CXX_FLAGS "-mlsx") + +set(CMAKE_REQUIRED_FLAGS "${ARCH_C_FLAGS}") CHECK_C_SOURCE_COMPILES("#include <${INTRIN_INC_H}> int main() { __m128i a = __lsx_vreplgr2vr_w(1); diff --git a/src/util/arch/loongarch64/match.hpp b/src/util/arch/loongarch64/match.hpp index 78651edc9..2b10c1a21 100644 --- a/src/util/arch/loongarch64/match.hpp +++ b/src/util/arch/loongarch64/match.hpp @@ -28,24 +28,6 @@ * POSSIBILITY OF SUCH DAMAGE. */ -static really_inline m128 vpmax_loongarch(v4u32 a, v4u32 b) { - u32 result[4]; - u32 tmp1 = __lsx_vpickve2gr_wu(a, 0); - u32 tmp2 = __lsx_vpickve2gr_wu(a, 1); - result[0] = (tmp1 >= tmp2) ? tmp1 : tmp2; - tmp1 = __lsx_vpickve2gr_wu(a, 2); - tmp2 = __lsx_vpickve2gr_wu(a, 3); - result[1] = (tmp1 >= tmp2) ? tmp1 : tmp2; - tmp1 = __lsx_vpickve2gr_wu(b, 0); - tmp2 = __lsx_vpickve2gr_wu(b, 1); - result[2] = (tmp1 >= tmp2) ? tmp1 : tmp2; - tmp1 = __lsx_vpickve2gr_wu(b, 2); - tmp2 = __lsx_vpickve2gr_wu(b, 3); - result[3] = (tmp1 >= tmp2) ? tmp1 : tmp2; - v4u32 res = __lsx_vld((uint32_t *)result, 0); - return res; -} - template <> really_really_inline const u8 *first_non_zero_match<16>(const u8 *buf, SuperVector<16> mask, u16 const UNUSED len) { diff --git a/src/util/arch/loongarch64/simd_utils.h b/src/util/arch/loongarch64/simd_utils.h index 9a207d366..58f5b387f 100644 --- a/src/util/arch/loongarch64/simd_utils.h +++ b/src/util/arch/loongarch64/simd_utils.h @@ -45,6 +45,24 @@ #include // for memcpy +static really_inline m128 vpmax_loongarch(v4u32 a, v4u32 b) { + u32 result[4]; + u32 tmp1 = __lsx_vpickve2gr_wu(a, 0); + u32 tmp2 = __lsx_vpickve2gr_wu(a, 1); + result[0] = (tmp1 >= tmp2) ? tmp1 : tmp2; + tmp1 = __lsx_vpickve2gr_wu(a, 2); + tmp2 = __lsx_vpickve2gr_wu(a, 3); + result[1] = (tmp1 >= tmp2) ? 
tmp1 : tmp2; + tmp1 = __lsx_vpickve2gr_wu(b, 0); + tmp2 = __lsx_vpickve2gr_wu(b, 1); + result[2] = (tmp1 >= tmp2) ? tmp1 : tmp2; + tmp1 = __lsx_vpickve2gr_wu(b, 2); + tmp2 = __lsx_vpickve2gr_wu(b, 3); + result[3] = (tmp1 >= tmp2) ? tmp1 : tmp2; + v4u32 res = __lsx_vld((uint32_t *)result, 0); + return res; +} + static really_inline m128 ones128(void) { return __lsx_vreplgr2vr_b(0xFF); }
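
A closing note on how the comparison masks defined in this port are meant to be consumed: each
byte lane of a 128-bit comparison owns 4 bits of the 64-bit comparemask (mask_width() returns 4,
Z_POSSHIFT is 2, and iteration_mask() keeps one significant bit per lane via
0x1111111111111111ull), so a match offset falls out of a trailing-zero count shifted right by
Z_POSSHIFT. The sketch below is a scalar illustration under that assumption, using the GCC/Clang
__builtin_ctzll builtin; first_match_offset is an illustrative name, not a function from the patch.

    #include <cstdint>

    // Decode a 64-bit comparemask in which byte lane i owns bits [4*i, 4*i+3].
    // Returns the first matching lane (0..15), or -1 if no lane matched.
    static int first_match_offset(uint64_t mask) {
        mask &= 0x1111111111111111ull;     // one significant bit per lane (iteration_mask)
        if (mask == 0) {
            return -1;
        }
        return __builtin_ctzll(mask) >> 2; // >> Z_POSSHIFT maps bit index to lane index
    }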