From bd3c6a231c471639c3f36e7859d85a7d5b2c39b5 Mon Sep 17 00:00:00 2001 From: "Node.js GitHub Bot" <github-bot@iojs.org> Date: Wed, 27 Mar 2024 21:53:28 +0200 Subject: [PATCH] deps: update zlib to 1.3.0.1-motley-24c07df PR-URL: https://github.com/nodejs/node/pull/52199 Reviewed-By: Marco Ippolito <marcoippolito54@gmail.com> Reviewed-By: Luigi Pinca <luigipinca@gmail.com> --- deps/zlib/CMakeLists.txt | 46 +++++++++++------ deps/zlib/adler32.c | 13 +++-- deps/zlib/adler32_simd.c | 104 +++++++++++++++++++++++++++++++++++++++ deps/zlib/cpu_features.c | 32 ++++++++++-- deps/zlib/cpu_features.h | 3 ++ deps/zlib/crc32.c | 6 ++- deps/zlib/deflate.c | 3 +- src/zlib_version.h | 2 +- 8 files changed, 183 insertions(+), 26 deletions(-) diff --git a/deps/zlib/CMakeLists.txt b/deps/zlib/CMakeLists.txt index 8389cdd6c38faa..c3f424770d92ce 100644 --- a/deps/zlib/CMakeLists.txt +++ b/deps/zlib/CMakeLists.txt @@ -74,6 +74,16 @@ if (ENABLE_SIMD_OPTIMIZATIONS) SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a+crc+crypto") endif() + + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + add_definitions(-DRISCV_RVV) + add_definitions(-DDEFLATE_SLIDE_HASH_RVV) + add_definitions(-DADLER32_SIMD_RVV) + #TODO(cavalcantii): add remaining flags as we port optimizations to RVV. + # Required by CPU features detection code. 
+ SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv") + endif() + endif() # @@ -180,20 +190,28 @@ set(ZLIB_SRCS # Update list of source files if optimizations were enabled #============================================================================ if (ENABLE_SIMD_OPTIMIZATIONS) - list(REMOVE_ITEM ZLIB_SRCS inflate.c) - - list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h) - list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h) - list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h) - list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h) - list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h) - - list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c) - list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c) - list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c) - list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c) - list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c) - list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c) + if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64") + message("RISCVV: Add optimizations.") + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h) + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c) + else() + list(REMOVE_ITEM ZLIB_SRCS inflate.c) + + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h) + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h) + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.h) + list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h) 
+ list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.h) + + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc32_simd.c) + list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/crc_folding.c) + endif() endif() # parse the full version number from zlib.h and include in ZLIB_FULL_VERSION diff --git a/deps/zlib/adler32.c b/deps/zlib/adler32.c index 99a294496f7eb5..de78b4e56b038e 100644 --- a/deps/zlib/adler32.c +++ b/deps/zlib/adler32.c @@ -58,7 +58,7 @@ #endif #include "cpu_features.h" -#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) +#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) || defined(ADLER32_SIMD_RVV) #include "adler32_simd.h" #endif @@ -66,12 +66,16 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) { unsigned long sum2; unsigned n; - + /* TODO(cavalcantii): verify if this lengths are optimal for current CPUs. */ +#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \ + || defined(ADLER32_SIMD_RVV) #if defined(ADLER32_SIMD_SSSE3) if (buf != Z_NULL && len >= 64 && x86_cpu_enable_ssse3) - return adler32_simd_(adler, buf, len); #elif defined(ADLER32_SIMD_NEON) if (buf != Z_NULL && len >= 64) +#elif defined(ADLER32_SIMD_RVV) + if (buf != Z_NULL && len >= 32 && riscv_cpu_enable_rvv) +#endif return adler32_simd_(adler, buf, len); #endif @@ -90,7 +94,8 @@ uLong ZEXPORT adler32_z(uLong adler, const Bytef *buf, z_size_t len) { return adler | (sum2 << 16); } -#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) +#if defined(ADLER32_SIMD_SSSE3) || defined(ADLER32_SIMD_NEON) \ + || defined(RISCV_RVV) /* * Use SIMD to compute the adler32. 
Since this function can be * freely used, check CPU features here. zlib convention is to diff --git a/deps/zlib/adler32_simd.c b/deps/zlib/adler32_simd.c index 58966eecf0b800..9970ea9ca71857 100644 --- a/deps/zlib/adler32_simd.c +++ b/deps/zlib/adler32_simd.c @@ -41,6 +41,9 @@ * [2] zlib adler32_z() uses this fact to implement NMAX-block-based updates * of the adler s1 s2 of uint32_t type (see adler32.c). */ +/* Copyright (C) 2023 SiFive, Inc. All rights reserved. + * For conditions of distribution and use, see copyright notice in zlib.h + */ #include "adler32_simd.h" @@ -363,4 +366,105 @@ uint32_t ZLIB_INTERNAL adler32_simd_( /* NEON */ return s1 | (s2 << 16); } +#elif defined(ADLER32_SIMD_RVV) +#include <riscv_vector.h> +/* adler32_rvv.c - RVV version of Adler-32 + * RVV 1.0 code contributed by Alex Chiang <alex.chiang@sifive.com> + * on https://github.com/zlib-ng/zlib-ng/pull/1532 + * Port from Simon Hosie's fork: + * https://github.com/cloudflare/zlib/commit/40688b53c61cb9bfc36471acd2dc0800b7ebcab1 + */ + +uint32_t ZLIB_INTERNAL adler32_simd_( /* RVV */ + uint32_t adler, + const unsigned char *buf, + unsigned long len) +{ + /* split Adler-32 into component sums */ + uint32_t sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + + size_t left = len; + size_t vl = __riscv_vsetvlmax_e8m1(); + vl = vl > 256 ? 256 : vl; + vuint32m4_t v_buf32_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint32m4_t v_adler32_prev_accu = __riscv_vmv_v_x_u32m4(0, vl); + vuint16m2_t v_buf16_accu; + + /* + * We accumulate 8-bit data, and to prevent overflow, we have to use a 32-bit accumulator. + * However, adding 8-bit data into a 32-bit accumulator isn't efficient. We use 16-bit & 32-bit + * accumulators to boost performance. + * + * The block_size is the largest multiple of vl that <= 256, because overflow would occur when + * vl > 256 (255 * 256 <= UINT16_MAX). + * + * We accumulate 8-bit data into a 16-bit accumulator and then + * move the data into the 32-bit accumulator at the last iteration. 
+ */ + size_t block_size = (256 / vl) * vl; + size_t nmax_limit = (NMAX / block_size); + size_t cnt = 0; + while (left >= block_size) { + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t subprob = block_size; + while (subprob > 0) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + buf += vl; + subprob -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, block_size / vl, v_buf32_accu, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + left -= block_size; + /* do modulo once each block of NMAX size */ + if (++cnt >= nmax_limit) { + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + cnt = 0; + } + } + /* the left len <= 256 now, we can use 16-bit accum safely */ + v_buf16_accu = __riscv_vmv_v_x_u16m2(0, vl); + size_t res = left; + while (left >= vl) { + vuint8m1_t v_buf8 = __riscv_vle8_v_u8m1(buf, vl); + v_adler32_prev_accu = __riscv_vwaddu_wv_u32m4(v_adler32_prev_accu, v_buf16_accu, vl); + v_buf16_accu = __riscv_vwaddu_wv_u16m2(v_buf16_accu, v_buf8, vl); + buf += vl; + left -= vl; + } + v_adler32_prev_accu = __riscv_vmacc_vx_u32m4(v_adler32_prev_accu, res / vl, v_buf32_accu, vl); + v_adler32_prev_accu = __riscv_vremu_vx_u32m4(v_adler32_prev_accu, BASE, vl); + v_buf32_accu = __riscv_vwaddu_wv_u32m4(v_buf32_accu, v_buf16_accu, vl); + + vuint32m4_t v_seq = __riscv_vid_v_u32m4(vl); + vuint32m4_t v_rev_seq = __riscv_vrsub_vx_u32m4(v_seq, vl, vl); + vuint32m4_t v_sum32_accu = __riscv_vmul_vv_u32m4(v_buf32_accu, v_rev_seq, vl); + + v_sum32_accu = __riscv_vadd_vv_u32m4(v_sum32_accu, __riscv_vmul_vx_u32m4(v_adler32_prev_accu, vl, vl), vl); + + vuint32m1_t v_sum2_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_sum2_sum = __riscv_vredsum_vs_u32m4_u32m1(v_sum32_accu, v_sum2_sum, vl); + uint32_t sum2_sum = 
__riscv_vmv_x_s_u32m1_u32(v_sum2_sum); + + sum2 += (sum2_sum + adler * (len - left)); + + vuint32m1_t v_adler_sum = __riscv_vmv_s_x_u32m1(0, vl); + v_adler_sum = __riscv_vredsum_vs_u32m4_u32m1(v_buf32_accu, v_adler_sum, vl); + uint32_t adler_sum = __riscv_vmv_x_s_u32m1_u32(v_adler_sum); + + adler += adler_sum; + + while (left--) { + adler += *buf++; + sum2 += adler; + } + + sum2 %= BASE; + adler %= BASE; + + return adler | (sum2 << 16); +} + #endif /* ADLER32_SIMD_SSSE3 */ diff --git a/deps/zlib/cpu_features.c b/deps/zlib/cpu_features.c index 64e0428cd2fc2d..34ae7b913af9a2 100644 --- a/deps/zlib/cpu_features.c +++ b/deps/zlib/cpu_features.c @@ -33,9 +33,13 @@ int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0; int ZLIB_INTERNAL x86_cpu_enable_simd = 0; int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0; +int ZLIB_INTERNAL riscv_cpu_enable_rvv = 0; +int ZLIB_INTERNAL riscv_cpu_enable_vclmul = 0; + #ifndef CPU_NO_SIMD -#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS) +#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \ + defined(ARMV8_OS_FUCHSIA) || defined(ARMV8_OS_IOS) #include <pthread.h> #endif @@ -62,7 +66,10 @@ int ZLIB_INTERNAL x86_cpu_enable_avx512 = 0; static void _cpu_check_features(void); #endif -#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS) +#if defined(ARMV8_OS_ANDROID) || defined(ARMV8_OS_LINUX) || \ + defined(ARMV8_OS_MACOS) || defined(ARMV8_OS_FUCHSIA) || \ + defined(X86_NOT_WINDOWS) || defined(ARMV8_OS_IOS) || \ + defined(RISCV_RVV) #if !defined(ARMV8_OS_MACOS) // _cpu_check_features() doesn't need to do anything on mac/arm since all // features are known at build time, so don't call it. 
@@ -184,6 +191,23 @@ static void _cpu_check_features(void) x86_cpu_enable_avx512 = _xgetbv(0) & 0x00000040; #endif } +#endif // x86 & NO_SIMD + +#elif defined(RISCV_RVV) +#include <sys/auxv.h> + +#ifndef ZLIB_HWCAP_RVV +#define ZLIB_HWCAP_RVV (1 << ('v' - 'a')) #endif -#endif -#endif + +/* TODO(cavalcantii) + * - add support for Android@RISCV i.e. __riscv_hwprobe(). + * - detect vclmul (crypto extensions). + */ +static void _cpu_check_features(void) +{ + unsigned long features = getauxval(AT_HWCAP); + riscv_cpu_enable_rvv = !!(features & ZLIB_HWCAP_RVV); +} +#endif // ARM | x86 | RISCV +#endif // NO SIMD CPU diff --git a/deps/zlib/cpu_features.h b/deps/zlib/cpu_features.h index aed3e834c5ac89..6092c7e852bda2 100644 --- a/deps/zlib/cpu_features.h +++ b/deps/zlib/cpu_features.h @@ -16,4 +16,7 @@ extern int x86_cpu_enable_ssse3; extern int x86_cpu_enable_simd; extern int x86_cpu_enable_avx512; +extern int riscv_cpu_enable_rvv; +extern int riscv_cpu_enable_vclmul; + void cpu_check_features(void); diff --git a/deps/zlib/crc32.c b/deps/zlib/crc32.c index cf8579f30aa707..32686f92488c51 100644 --- a/deps/zlib/crc32.c +++ b/deps/zlib/crc32.c @@ -706,7 +706,8 @@ unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf, * place to cache CPU features if needed for those later, more * interesting crc32() calls. */ -#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) +#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \ + || defined(RISCV_RVV) /* * Since this routine can be freely used, check CPU features here. */ @@ -1085,7 +1086,8 @@ unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf, /* Some bots compile with optimizations disabled, others will emulate * ARM on x86 and other weird combinations. 
*/ -#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) +#if defined(CRC32_SIMD_SSE42_PCLMUL) || defined(CRC32_ARMV8_CRC32) \ + || defined(RISCV_RVV) /* We got to verify CPU features, so exploit the common usage pattern * of calling this function with Z_NULL for an initial valid crc value. * This allows to cache the result of the feature check and avoid extraneous diff --git a/deps/zlib/deflate.c b/deps/zlib/deflate.c index a67d195c5d46f2..b9a312030464c7 100644 --- a/deps/zlib/deflate.c +++ b/deps/zlib/deflate.c @@ -401,7 +401,8 @@ int ZEXPORT deflateInit2_(z_streamp strm, int level, int method, // for all wrapper formats (e.g. RAW, ZLIB, GZIP). // Feature detection is not triggered while using RAW mode (i.e. we never // call crc32() with a NULL buffer). -#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) +#if defined(CRC32_ARMV8_CRC32) || defined(CRC32_SIMD_SSE42_PCLMUL) \ + || defined(RISCV_RVV) cpu_check_features(); #endif diff --git a/src/zlib_version.h b/src/zlib_version.h index cf8fb2d37a4080..3a0040aad23dce 100644 --- a/src/zlib_version.h +++ b/src/zlib_version.h @@ -2,5 +2,5 @@ // Refer to tools/dep_updaters/update-zlib.sh #ifndef SRC_ZLIB_VERSION_H_ #define SRC_ZLIB_VERSION_H_ -#define ZLIB_VERSION "1.3.0.1-motley-24342f6" +#define ZLIB_VERSION "1.3.0.1-motley-24c07df" #endif // SRC_ZLIB_VERSION_H_