diff --git a/CMakeLists.txt b/CMakeLists.txt index 0fe939df6..f73b6bd0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,6 +7,8 @@ set(VERSION "1.2.11") option(ASM686 "Enable building i686 assembly implementation") option(AMD64 "Enable building amd64 assembly implementation") +option(ARMv8 "Enable building ARM NEON intrinsics implementation") +option(ARMv8CRC "Enable building ARM CRC32 instruction") set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables") set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries") @@ -132,14 +134,39 @@ endif() if(CMAKE_COMPILER_IS_GNUCC) if(ASM686) set(ZLIB_ASMS contrib/asm686/match.S) - elseif (AMD64) + elseif(AMD64) set(ZLIB_ASMS contrib/amd64/amd64-match.S) - endif () + endif() - if(ZLIB_ASMS) - add_definitions(-DASMV) - set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) - endif() + if(ARMv8) + set(ZLIB_ARMv8 contrib/arm/neon_adler32.c) + endif() + if(ARMv8CRC) + set(ZLIB_ARMv8CRC contrib/arm/armv8_crc32.c) + endif() + + if(ZLIB_ASMS) + add_definitions(-DASMV) + set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE) + elseif(ZLIB_ARMv8 OR ZLIB_ARMv8CRC) + if(ZLIB_ARMv8) + add_definitions(-DARMv8) + set(COMPILER ${CMAKE_C_COMPILER}) + # NEON is mandatory in ARMv8. + if(${COMPILER} MATCHES "aarch64") + set_source_files_properties(${ZLIB_ARMv8} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a) + # But it was optional for ARMv7. + elseif(${COMPILER} MATCHES "arm") + set_source_files_properties(${ZLIB_ARMv8} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon) + endif() + endif() + + if(ZLIB_ARMv8CRC) + add_definitions(-DARMv8CRC) + set(COMPILER ${CMAKE_C_COMPILER}) + set_source_files_properties(${ZLIB_ARMv8CRC} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a+crc) + endif() + endif() endif() if(MSVC) @@ -183,8 +210,8 @@ if(MINGW) set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj) endif(MINGW) -add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) -add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) +add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARMv8} ${ZLIB_ARMv8CRC} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) +add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARMv8} ${ZLIB_ARMv8CRC} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS}) set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL) set_target_properties(zlib PROPERTIES SOVERSION 1) diff --git a/adler32.c b/adler32.c index d0be4380a..45ebaa4b2 100644 --- a/adler32.c +++ b/adler32.c @@ -136,7 +136,12 @@ uLong ZEXPORT adler32(adler, buf, len) const Bytef *buf; uInt len; { +#ifdef ARMv8 +# pragma message("Using NEON-ized Adler32.") + return NEON_adler32(adler, buf, len); +#else return adler32_z(adler, buf, len); +#endif } /* ========================================================================= */ diff --git a/contrib/README.contrib b/contrib/README.contrib index a411d5c39..3fd1d202c 100644 --- a/contrib/README.contrib +++ b/contrib/README.contrib @@ -12,6 +12,9 @@ amd64/ by Mikhail Teterin asm code for AMD64 See patch at http://www.freebsd.org/cgi/query-pr.cgi?pr=bin/96393 +arm/ by Adenilson Cavalcanti + ARM optimizations (NEON and ARMv8 code). + asm686/ by Brian Raiter asm code for Pentium and PPro/PII, using the AT&T (GNU as) syntax See http://www.muppetlabs.com/~breadbox/software/assembly.html diff --git a/contrib/arm/armv8_crc32.c b/contrib/arm/armv8_crc32.c new file mode 100644 index 000000000..1781414d1 --- /dev/null +++ b/contrib/arm/armv8_crc32.c @@ -0,0 +1,67 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. + * Authors: Adenilson Cavalcanti + * Yang Zhang + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ +#include +// Depending on the compiler flavor, size_t may be defined in +// one or the other header. See: +// http://stackoverflow.com/questions/26410466/gcc-linaro-compiler-throws-error-unknown-type-name-size-t +#include +#include + +uint32_t armv8_crc32_little(uint32_t crc, + const unsigned char *buf, + size_t len) { + uint32_t c; + const uint32_t *buf4; + + c = crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = __crc32b(c, *buf++); + len--; + } + + buf4 = (const uint32_t *)(const void *)buf; + + while (len >= 32) { + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + c = __crc32w(c, *buf4++); + len -= 32; + } + + while (len >= 4) { + c = __crc32w(c, *buf4++); + len -= 4; + } + + buf = (const unsigned char *)buf4; + if (len) { + do { + c = __crc32b(c, *buf++); + } while (--len); + } + + c = ~c; + return c; +} diff --git a/contrib/arm/neon_adler32.c b/contrib/arm/neon_adler32.c new file mode 100644 index 000000000..f173a74f1 --- /dev/null +++ b/contrib/arm/neon_adler32.c @@ -0,0 +1,137 @@ +/* Copyright (C) 1995-2011, 2016 Mark Adler + * Copyright (C) 2017 ARM Holdings Inc. + * Authors: Adenilson Cavalcanti + * Simon Hosie + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + */ + +#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) +#include + +static void NEON_accum32(uint32_t *s, const unsigned char *buf, + unsigned int len) +{ + static const uint8_t taps[32] = { + 32, 31, 30, 29, 28, 27, 26, 25, + 24, 23, 22, 21, 20, 19, 18, 17, + 16, 15, 14, 13, 12, 11, 10, 9, + 8, 7, 6, 5, 4, 3, 2, 1 }; + + uint32x2_t adacc2, s2acc2, as; + uint8x16_t t0 = vld1q_u8(taps), t1 = vld1q_u8(taps + 16); + + uint32x4_t adacc = vdupq_n_u32(0), s2acc = vdupq_n_u32(0); + adacc = vsetq_lane_u32(s[0], adacc, 0); + s2acc = vsetq_lane_u32(s[1], s2acc, 0); + + while (len >= 2) { + uint8x16_t d0 = vld1q_u8(buf), d1 = vld1q_u8(buf + 16); + uint16x8_t adler, sum2; + s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 5)); + adler = vpaddlq_u8( d0); + adler = vpadalq_u8(adler, d1); + sum2 = vmull_u8( vget_low_u8(t0), vget_low_u8(d0)); + sum2 = vmlal_u8(sum2, vget_high_u8(t0), vget_high_u8(d0)); + sum2 = vmlal_u8(sum2, vget_low_u8(t1), vget_low_u8(d1)); + sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d1)); + adacc = vpadalq_u16(adacc, adler); + s2acc = vpadalq_u16(s2acc, sum2); + len -= 2; + buf += 32; + } + + while (len > 0) { + uint8x16_t d0 = vld1q_u8(buf); + uint16x8_t adler, sum2; + s2acc = vaddq_u32(s2acc, vshlq_n_u32(adacc, 4)); + adler = vpaddlq_u8(d0); + sum2 = vmull_u8( vget_low_u8(t1), vget_low_u8(d0)); + sum2 = vmlal_u8(sum2, vget_high_u8(t1), vget_high_u8(d0)); + adacc = vpadalq_u16(adacc, adler); + s2acc = vpadalq_u16(s2acc, sum2); + buf += 16; + len--; + } + + adacc2 = vpadd_u32(vget_low_u32(adacc), vget_high_u32(adacc)); + s2acc2 = vpadd_u32(vget_low_u32(s2acc), vget_high_u32(s2acc)); + as = vpadd_u32(adacc2, s2acc2); + s[0] = vget_lane_u32(as, 0); + s[1] = vget_lane_u32(as, 1); +} + +static void NEON_handle_tail(uint32_t *pair, const unsigned char *buf, + unsigned int len) +{ + /* Oldie K&R code integration. */ + unsigned int i; + for (i = 0; i < len; ++i) { + pair[0] += buf[i]; + pair[1] += pair[0]; + } +} + +extern unsigned long NEON_adler32(unsigned long adler, const unsigned char *buf, + const unsigned int len) +{ + /* initial Adler-32 value (deferred check for len == 1 speed) */ + if (!buf) + return 1L; + + /* The largest prime smaller than 65536. */ + const uint32_t M_BASE = 65521; + /* This is the threshold where doing accumulation may overflow. */ + const int M_NMAX = 5552; + + unsigned long sum2; + uint32_t pair[2]; + int n = M_NMAX; + unsigned int done = 0; + /* Oldie K&R code integration. */ + unsigned int i; + + /* Split Adler-32 into component sums, it can be supplied by + * the caller sites (e.g. in a PNG file). + */ + sum2 = (adler >> 16) & 0xffff; + adler &= 0xffff; + pair[0] = adler; + pair[1] = sum2; + + for (i = 0; i < len; i += n) { + if ((i + n) > len) + n = len - i; + + if (n < 16) + break; + + NEON_accum32(pair, buf + i, n / 16); + pair[0] %= M_BASE; + pair[1] %= M_BASE; + + done += (n / 16) * 16; + } + + /* Handle the tail elements. */ + if (done < len) { + NEON_handle_tail(pair, (buf + done), len - done); + pair[0] %= M_BASE; + pair[1] %= M_BASE; + } + + /* D = B * 65536 + A, see: https://en.wikipedia.org/wiki/Adler-32. */ + return (pair[1] << 16) | pair[0]; +} +#endif diff --git a/crc32.c b/crc32.c index 9580440c0..2629341ef 100644 --- a/crc32.c +++ b/crc32.c @@ -239,7 +239,12 @@ unsigned long ZEXPORT crc32(crc, buf, len) const unsigned char FAR *buf; uInt len; { +#ifdef ARMv8CRC +# pragma message("Using ARMv8 CRC32 instruction.") + return armv8_crc32_little(crc, buf, len); +#else return crc32_z(crc, buf, len); +#endif } #ifdef BYFOUR