Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Inflate fast NEON optimization #345

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 18 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(VERSION "1.2.11")

option(ASM686 "Enable building i686 assembly implementation")
option(AMD64 "Enable building amd64 assembly implementation")
option(ARM_NEON "Enable building ARM NEON intrinsics implementation")

set(INSTALL_BIN_DIR "${CMAKE_INSTALL_PREFIX}/bin" CACHE PATH "Installation directory for executables")
set(INSTALL_LIB_DIR "${CMAKE_INSTALL_PREFIX}/lib" CACHE PATH "Installation directory for libraries")
Expand Down Expand Up @@ -136,6 +137,21 @@ if(CMAKE_COMPILER_IS_GNUCC)
set(ZLIB_ASMS contrib/amd64/amd64-match.S)
endif ()

if(ARM_NEON)
    # The contrib sources are a drop-in replacement for inflate.c, so the
    # generic copy must be removed from the source list before they are added.
    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
    set(ZLIB_ARM_NEON_HDRS contrib/arm/chunkcopy.h contrib/arm/inffast_chunk.h)
    set(ZLIB_ARM_NEON contrib/arm/inflate.c contrib/arm/inffast_chunk.c)
    add_definitions(-DARM_NEON)
    # NOTE(review): this matches substrings of the compiler *path* (e.g.
    # "aarch64-linux-gnu-gcc"), which only works for conventionally named
    # toolchains; CMAKE_SYSTEM_PROCESSOR may be more robust -- TODO confirm.
    set(COMPILER ${CMAKE_C_COMPILER})
    # NEON is mandatory in ARMv8.
    if(${COMPILER} MATCHES "aarch64")
        set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -march=armv8-a)
    # But it was optional for ARMv7.
    elseif(${COMPILER} MATCHES "arm")
        set_source_files_properties(${ZLIB_ARM_NEON} PROPERTIES LANGUAGE C COMPILE_FLAGS -mfpu=neon)
    endif()
endif()

if(ZLIB_ASMS)
add_definitions(-DASMV)
set_source_files_properties(${ZLIB_ASMS} PROPERTIES LANGUAGE C COMPILE_FLAGS -DNO_UNDERLINE)
Expand Down Expand Up @@ -183,8 +199,8 @@ if(MINGW)
set(ZLIB_DLL_SRCS ${CMAKE_CURRENT_BINARY_DIR}/zlib1rc.obj)
endif(MINGW)

add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
add_library(zlib SHARED ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARM_NEON} ${ZLIB_ARM_NEON_HDRS} ${ZLIB_DLL_SRCS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
add_library(zlibstatic STATIC ${ZLIB_SRCS} ${ZLIB_ASMS} ${ZLIB_ARM_NEON} ${ZLIB_ARM_NEON_HDRS} ${ZLIB_PUBLIC_HDRS} ${ZLIB_PRIVATE_HDRS})
set_target_properties(zlib PROPERTIES DEFINE_SYMBOL ZLIB_DLL)
set_target_properties(zlib PROPERTIES SOVERSION 1)

Expand Down
297 changes: 297 additions & 0 deletions contrib/arm/chunkcopy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,297 @@
/* chunkcopy.h -- fast copies and sets
* Copyright (C) 2017 ARM, Inc.
* For conditions of distribution and use, see copyright notice in zlib.h
*/

#ifndef CHUNKCOPY_H
#define CHUNKCOPY_H

#include <stddef.h>  /* ptrdiff_t */

#include <arm_neon.h>

#include "zutil.h"

#if __STDC_VERSION__ >= 199901L
#define Z_RESTRICT restrict
#else
#define Z_RESTRICT
#endif

typedef uint8x16_t chunkcopy_chunk_t;
#define CHUNKCOPY_CHUNK_SIZE sizeof(chunkcopy_chunk_t)

/*
   Read one CHUNKCOPY_CHUNK_SIZE-byte chunk from `s`, which need not be
   aligned. Going through memcpy lets the compiler emit a wide unaligned
   load for the chunkcopy_chunk_t type without invoking undefined behaviour.
 */
static inline chunkcopy_chunk_t loadchunk(const unsigned char FAR* s) {
  chunkcopy_chunk_t chunk;
  __builtin_memcpy(&chunk, s, sizeof chunk);
  return chunk;
}

/*
   Write the CHUNKCOPY_CHUNK_SIZE-byte chunk `c` to `d`, which need not be
   aligned. As with loadchunk(), the memcpy form is expected to compile to a
   single wide unaligned store.
 */
static inline void storechunk(unsigned char FAR* d, chunkcopy_chunk_t c) {
  __builtin_memcpy(d, &c, sizeof c);
}

/*
   Perform a memcpy-like operation, but assume that length is non-zero and that
   it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE bytes of output even if
   the length is shorter than this.

   It also guarantees that it will properly unroll the data if the distance
   between `out` and `from` is at least CHUNKCOPY_CHUNK_SIZE, which we rely on
   in chunkcopy_relaxed().

   Aside from better memory bus utilisation, this means that short copies
   (CHUNKCOPY_CHUNK_SIZE bytes or fewer) will fall straight through the loop
   without iteration, which will hopefully make the branch prediction more
   reliable.
 */
static inline unsigned char FAR* chunkcopy_core(unsigned char FAR* out,
                                                const unsigned char FAR* from,
                                                unsigned len) {
  /* bump == ((len - 1) % CHUNK) + 1: size of the first, possibly partial,
   * chunk, always in [1, CHUNKCOPY_CHUNK_SIZE]. Placing the partial chunk
   * first keeps the loop below iterating over whole chunks only. */
  int bump = (--len % CHUNKCOPY_CHUNK_SIZE) + 1;
  /* A full chunk is always stored; writing past `len` bytes is permitted by
   * the function's contract. */
  storechunk(out, loadchunk(from));
  out += bump;
  from += bump;
  /* `len` was pre-decremented above, so this yields the number of remaining
   * whole chunks after the bump. */
  len /= CHUNKCOPY_CHUNK_SIZE;
  while (len-- > 0) {
    storechunk(out, loadchunk(from));
    out += CHUNKCOPY_CHUNK_SIZE;
    from += CHUNKCOPY_CHUNK_SIZE;
  }
  return out;
}

/*
   Like chunkcopy_core(), but avoid writing beyond the end of legal output.

   Accepts an additional pointer to the end of safe output. A generic safe
   copy would use (out + len), but it's normally the case that the end of the
   output buffer is beyond the end of the current copy, and this can still be
   exploited.
 */
static inline unsigned char FAR* chunkcopy_core_safe(
    unsigned char FAR* out,
    const unsigned char FAR* from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  /* Compare in the signed domain: CHUNKCOPY_CHUNK_SIZE is a size_t, and the
   * implicit ptrdiff_t -> size_t conversion would turn a (contract-violating)
   * negative distance into a huge value and select the over-writing wide
   * store path below. */
  if (limit - out < (ptrdiff_t)CHUNKCOPY_CHUNK_SIZE) {
    /* Not enough room for a full chunk store: copy in 8/4/2/1-byte pieces.
     * By the Assert above len <= limit - out < CHUNKCOPY_CHUNK_SIZE (16), so
     * these four bit tests cover every possible length. */
    const unsigned char FAR* Z_RESTRICT rfrom = from;
    if (len & 8) {
      __builtin_memcpy(out, rfrom, 8);
      out += 8;
      rfrom += 8;
    }
    if (len & 4) {
      __builtin_memcpy(out, rfrom, 4);
      out += 4;
      rfrom += 4;
    }
    if (len & 2) {
      __builtin_memcpy(out, rfrom, 2);
      out += 2;
      rfrom += 2;
    }
    if (len & 1) {
      *out++ = *rfrom++;
    }
    return out;
  }
  /* At least one whole chunk of slack: the fast path may over-write safely. */
  return chunkcopy_core(out, from, len);
}

/*
   Perform short copies until distance can be rewritten as being at least
   CHUNKCOPY_CHUNK_SIZE.

   This assumes that it's OK to overwrite at least the first
   2*CHUNKCOPY_CHUNK_SIZE bytes of output even if the copy is shorter than
   this. This assumption holds within inflate_fast() which starts every
   iteration with at least 258 bytes of output space available (258 being the
   maximum length output from a single token; see inffast.c).
 */
static inline unsigned char FAR* chunkunroll_relaxed(unsigned char FAR* out,
                                                     unsigned FAR* dist,
                                                     unsigned FAR* len) {
  const unsigned char FAR* from = out - *dist;
  /* Each iteration copies one full period of the repeating pattern, after
   * which the pattern also repeats over 2*dist bytes -- hence *dist doubles.
   * Only *dist bytes of each stored chunk are "consumed" (out advances by
   * *dist, not by the chunk size); the rest is permitted over-write. */
  while (*dist < *len && *dist < CHUNKCOPY_CHUNK_SIZE) {
    storechunk(out, loadchunk(from));
    out += *dist;
    *len -= *dist;
    *dist += *dist;
  }
  return out;
}

/*
   Load 8 bytes from `from` (which need not be aligned) and duplicate them
   into both 64-bit lanes of a 128-bit vector, so the result is the 8-byte
   pattern repeated twice.
 */
static inline uint8x16_t chunkset_vld1q_dup_u8x8(
    const unsigned char FAR* Z_RESTRICT from) {
#if defined(__clang__) || defined(__aarch64__)
  return vreinterpretq_u8_u64(vld1q_dup_u64((void*)from));
#else
  /* 32-bit GCC uses an alignment hint for vld1q_dup_u64, even when given a
   * void pointer, so here's an alternate implementation.
   */
  uint8x8_t h = vld1_u8(from);
  return vcombine_u8(h, h);
#endif
}

/*
   Perform an overlapping copy which behaves as a memset() operation, but
   supporting periods other than one, and assume that length is non-zero and
   that it's OK to overwrite at least CHUNKCOPY_CHUNK_SIZE*3 bytes of output
   even if the length is shorter than this.

   Periods of 1, 2, 4 and 8 bytes divide the 16-byte vector evenly and get
   dedicated splat-and-store loops; any other period falls through to
   chunkunroll_relaxed() + chunkcopy_core().
 */
static inline unsigned char FAR* chunkset_core(unsigned char FAR* out,
                                               unsigned period,
                                               unsigned len) {
  uint8x16_t f;
  /* Size of the first, possibly partial, store: always in [1, sizeof(f)],
   * chosen so the remaining length is a whole multiple of sizeof(f). */
  int bump = ((len - 1) % sizeof(f)) + 1;

  switch (period) {
    case 1:
      /* Splat the single byte preceding `out` across the whole vector. All
       * lanes are equal, so no reload is needed after the bump. */
      f = vld1q_dup_u8(out - 1);
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      while (len > 0) {
        vst1q_u8(out, f);
        out += sizeof(f);
        len -= sizeof(f);
      }
      return out;
    case 2:
      f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2)));
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      if (len > 0) {
        /* `bump` may not be a multiple of the period, so the pattern phase
         * at the new `out` can differ from the original; reload from just
         * behind the cursor to re-synchronise. */
        f = vreinterpretq_u8_u16(vld1q_dup_u16((void*)(out - 2)));
        do {
          vst1q_u8(out, f);
          out += sizeof(f);
          len -= sizeof(f);
        } while (len > 0);
      }
      return out;
    case 4:
      f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4)));
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      if (len > 0) {
        /* Re-synchronise the pattern phase after the partial bump store. */
        f = vreinterpretq_u8_u32(vld1q_dup_u32((void*)(out - 4)));
        do {
          vst1q_u8(out, f);
          out += sizeof(f);
          len -= sizeof(f);
        } while (len > 0);
      }
      return out;
    case 8:
      f = chunkset_vld1q_dup_u8x8(out - 8);
      vst1q_u8(out, f);
      out += bump;
      len -= bump;
      if (len > 0) {
        /* Re-synchronise the pattern phase after the partial bump store. */
        f = chunkset_vld1q_dup_u8x8(out - 8);
        do {
          vst1q_u8(out, f);
          out += sizeof(f);
          len -= sizeof(f);
        } while (len > 0);
      }
      return out;
  }
  /* Generic period: widen the distance until it covers a full chunk, then
   * finish with an ordinary chunk copy. */
  out = chunkunroll_relaxed(out, &period, &len);
  return chunkcopy_core(out, out - period, len);
}

/*
   memcpy-like copy: length must be non-zero, and the caller must tolerate
   up to CHUNKCOPY_CHUNK_SIZE bytes of over-write past the requested length.

   Unlike chunkcopy_core() above, no guarantee is made regarding the behaviour
   of overlapping buffers, regardless of the distance between the pointers.
   This is reflected in the `restrict`-qualified pointers, allowing the
   compiler to reorder loads and stores.
 */
static inline unsigned char FAR* chunkcopy_relaxed(
    unsigned char FAR* Z_RESTRICT out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len) {
  /* Pure delegation: the no-overlap promise carried by the qualifiers is the
   * only difference from chunkcopy_core() itself. */
  return chunkcopy_core(out, from, len);
}

/*
   Like chunkcopy_relaxed(), but never writes at or past `limit`.

   Unlike chunkcopy_core_safe() above, no guarantee is made regarding the
   behaviour of overlapping buffers, regardless of the distance between the
   pointers. This is reflected in the `restrict`-qualified pointers, allowing
   the compiler to reorder loads and stores.

   Accepts an additional pointer to the end of safe output. A generic safe
   copy would use (out + len), but it's normally the case that the end of the
   output buffer is beyond the end of the current copy, and this can still be
   exploited.
 */
static inline unsigned char FAR* chunkcopy_safe(
    unsigned char FAR* out,
    const unsigned char FAR* Z_RESTRICT from,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");

  /* Delegate to the bounded core copy; the qualifiers above merely record
   * that the buffers never overlap. */
  return chunkcopy_core_safe(out, from, len, limit);
}

/*
   Perform a chunky copy within the same buffer, where the source and
   destination may potentially overlap.

   Assumes that len > 0 on entry, and that it's safe to write at least
   CHUNKCOPY_CHUNK_SIZE*3 bytes to the output.
 */
static inline unsigned char FAR*
chunkcopy_lapped_relaxed(unsigned char FAR* out, unsigned dist, unsigned len) {
  /* When the distance is at least one chunk, or at least the whole length,
   * a plain forward chunk copy cannot clobber bytes it has yet to read. */
  if (dist >= len || dist >= CHUNKCOPY_CHUNK_SIZE) {
    return chunkcopy_core(out, out - dist, len);
  }
  /* Otherwise the output repeats with period `dist`: treat it as a
   * pattern-fill. */
  return chunkset_core(out, dist, len);
}

/*
   Behave like chunkcopy_lapped_relaxed(), but avoid writing beyond the end
   of legal output.

   Accepts an additional pointer to the end of safe output. A generic safe
   copy would use (out + len), but it's normally the case that the end of the
   output buffer is beyond the end of the current copy, and this can still be
   exploited.
 */
static inline unsigned char FAR* chunkcopy_lapped_safe(
    unsigned char FAR* out,
    unsigned dist,
    unsigned len,
    unsigned char FAR* limit) {
  Assert(out + len <= limit, "chunk copy exceeds safety limit");
  /* Compare in the signed domain: CHUNKCOPY_CHUNK_SIZE*3 is a size_t, and
   * the implicit ptrdiff_t -> size_t conversion would turn a
   * (contract-violating) negative distance into a huge value and select the
   * over-writing fast path below. */
  if (limit - out < (ptrdiff_t)(CHUNKCOPY_CHUNK_SIZE * 3)) {
    /* Not enough slack for the relaxed variant's over-write; fall back to a
     * byte-at-a-time overlapping copy. */
    /* TODO(cavalcantii): try harder to optimise this */
    while (len-- > 0) {
      *out = *(out - dist);
      out++;
    }
    return out;
  }
  return chunkcopy_lapped_relaxed(out, dist, len);
}

#undef Z_RESTRICT

#endif /* CHUNKCOPY_H */
Loading