Skip to content

Commit

Permalink
deflate: add new deflate_quick strategy for level 1
Browse files Browse the repository at this point in the history
The deflate_quick strategy is designed to provide maximum
deflate performance.

deflate_quick achieves this through:
- only checking the first hash match
- using a small inline SSE4.2-optimized longest_match
- forcing a window size of 8K, and using a precomputed dist/len
  table
- forcing the static Huffman tree and emitting codes immediately
  instead of tallying

This patch changes the scope of flush_pending, fill_window, bi_windup,
and static_ltree to ZLIB_INTERNAL and moves END_BLOCK, block_state,
send_code, put_short, and send_bits to deflate.h.

Updates the configure script to enable deflate_quick by default on x86.
On systems without SSE4.2, deflate falls back to the deflate_fast strategy.

Fixes madler#6
Fixes madler#8
  • Loading branch information
jtkukunas authored and GrabYourPitchforks committed Jul 28, 2022
1 parent cfa85f9 commit ac43ebd
Show file tree
Hide file tree
Showing 7 changed files with 2,440 additions and 99 deletions.
12 changes: 10 additions & 2 deletions Makefile.in
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,11 @@ SRCDIR=
ZINC=
ZINCOUT=-I.

OBJZ = adler32.o crc32.o crc_folding.o deflate.o infback.o inffast.o inflate.o inftrees.o slide_sse.o trees.o x86.o zutil.o
OBJZ = adler32.o crc32.o crc_folding.o deflate_quick.o deflate.o infback.o inffast.o inflate.o inftrees.o slide_sse.o trees.o x86.o zutil.o
OBJG = compress.o uncompr.o gzclose.o gzlib.o gzread.o gzwrite.o
OBJC = $(OBJZ) $(OBJG)

PIC_OBJZ = adler32.lo crc32.lo crc_folding.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo slide_sse.lo trees.lo x86.lo zutil.lo
PIC_OBJZ = adler32.lo crc32.lo crc_folding.lo deflate_quick.lo deflate.lo infback.lo inffast.lo inflate.lo inftrees.lo slide_sse.lo trees.lo x86.lo zutil.lo
PIC_OBJG = compress.lo uncompr.lo gzclose.lo gzlib.lo gzread.lo gzwrite.lo
PIC_OBJC = $(PIC_OBJZ) $(PIC_OBJG)

Expand Down Expand Up @@ -210,6 +210,9 @@ slide_sse.o: $(SRCDIR)slide_sse.c
crc_folding.o: $(SRCDIR)crc_folding.c
$(CC) $(CFLAGS) $(ZINC) -mpclmul -msse4 -c -o $@ $(SRCDIR)crc_folding.c

# Static object for deflate_quick.c; built with -msse4 (the commit message
# describes an inline SSE4.2-optimized longest_match in this strategy).
deflate_quick.o: $(SRCDIR)deflate_quick.c
$(CC) $(CFLAGS) $(ZINC) -msse4 -c -o $@ $(SRCDIR)deflate_quick.c

adler32.lo: $(SRCDIR)adler32.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -DPIC -c -o objs/adler32.o $(SRCDIR)adler32.c
Expand Down Expand Up @@ -300,6 +303,11 @@ crc_folding.lo: $(SRCDIR)crc_folding.c
$(CC) $(SFLAGS) $(ZINC) -mpclmul -msse4 -DPIC -c -o objs/crc_folding.o $(SRCDIR)crc_folding.c
-@mv objs/crc_folding.o $@

# PIC object for deflate_quick.c: compiled into objs/ with -DPIC and -msse4,
# then moved to the .lo target name.
deflate_quick.lo: $(SRCDIR)deflate_quick.c
-@mkdir objs 2>/dev/null || test -d objs
$(CC) $(SFLAGS) $(ZINC) -msse4 -DPIC -c -o objs/deflate_quick.o $(SRCDIR)deflate_quick.c
-@mv objs/deflate_quick.o $@

placebo $(SHAREDLIBV): $(PIC_OBJS) libz.a
$(LDSHARED) $(SFLAGS) -o $@ $(PIC_OBJS) $(LDSHAREDLIBC) $(LDFLAGS)
rm -f $(SHAREDLIB) $(SHAREDLIBM)
Expand Down
43 changes: 27 additions & 16 deletions deflate.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,13 +63,6 @@ const char deflate_copyright[] =
/* ===========================================================================
* Function prototypes.
*/
typedef enum {
need_more, /* block not completed, need more input or more output */
block_done, /* block flush performed */
finish_started, /* finish started, need only more output at next deflate */
finish_done /* finish done, accept no more input or output */
} block_state;

typedef block_state (*compress_func) OF((deflate_state *s, int flush));
/* Compression function. Returns the block state after the call. */

Expand All @@ -79,17 +72,18 @@ local void slide_hash_c OF((deflate_state *s));
#ifdef USE_SSE_SLIDE
extern void slide_hash_sse(deflate_state *s);
#endif
local void fill_window OF((deflate_state *s));
local block_state deflate_stored OF((deflate_state *s, int flush));
local block_state deflate_fast OF((deflate_state *s, int flush));
#ifndef FASTEST
local block_state deflate_slow OF((deflate_state *s, int flush));
#endif
#ifdef USE_QUICK
block_state deflate_quick OF((deflate_state *s, int flush));
#endif
local block_state deflate_rle OF((deflate_state *s, int flush));
local block_state deflate_huff OF((deflate_state *s, int flush));
local void lm_init OF((deflate_state *s));
local void putShortMSB OF((deflate_state *s, uInt b));
local void flush_pending OF((z_streamp strm));
local unsigned read_buf OF((z_streamp strm, Bytef *buf, unsigned size));
#ifdef ASMV
# pragma message("Assembler code may have bugs -- use at your own risk")
Expand Down Expand Up @@ -138,10 +132,15 @@ local const config configuration_table[2] = {
local const config configuration_table[10] = {
/* good lazy nice chain */
/* 0 */ {0, 0, 0, 0, deflate_stored}, /* store only */
#ifdef USE_QUICK
/* 1 */ {4, 4, 8, 4, deflate_quick},
/* 2 */ {4, 4, 8, 4, deflate_fast},
/* 3 */ {4, 6, 32, 32, deflate_fast},
#else
/* 1 */ {4, 4, 8, 4, deflate_fast}, /* max speed, no lazy matches */
/* 2 */ {4, 5, 16, 8, deflate_fast},
/* 3 */ {4, 6, 32, 32, deflate_fast},

#endif
/* 4 */ {4, 4, 16, 16, deflate_slow}, /* lazy matches */
/* 5 */ {8, 16, 32, 32, deflate_slow},
/* 6 */ {8, 16, 128, 128, deflate_slow},
Expand Down Expand Up @@ -286,6 +285,12 @@ int ZEXPORT deflateInit2_(strm, level, method, windowBits, memLevel, strategy,
return Z_STREAM_ERROR;
}
if (windowBits == 8) windowBits = 9; /* until 256-byte window bug fixed */

#ifdef USE_QUICK
if (level == 1)
windowBits = 13;
#endif

s = (deflate_state *) ZALLOC(strm, 1, sizeof(deflate_state));
if (s == Z_NULL) return Z_MEM_ERROR;
strm->state = (struct internal_state FAR *)s;
Expand Down Expand Up @@ -765,7 +770,7 @@ local void putShortMSB (s, b)
* applications may wish to modify it to avoid allocating a large
* strm->next_out buffer and copying into it. (See also read_buf()).
*/
local void flush_pending(strm)
ZLIB_INTERNAL void flush_pending(strm)
z_streamp strm;
{
unsigned len;
Expand Down Expand Up @@ -1035,10 +1040,16 @@ int ZEXPORT deflate (strm, flush)
(flush != Z_NO_FLUSH && s->status != FINISH_STATE)) {
block_state bstate;

bstate = s->level == 0 ? deflate_stored(s, flush) :
s->strategy == Z_HUFFMAN_ONLY ? deflate_huff(s, flush) :
s->strategy == Z_RLE ? deflate_rle(s, flush) :
(*(configuration_table[s->level].func))(s, flush);
if (s->level == 0)
bstate = deflate_stored(s, flush);
else if (s->strategy == Z_HUFFMAN_ONLY)
bstate = deflate_huff(s, flush);
else if (s->strategy == Z_RLE)
bstate = deflate_rle(s, flush);
else if (s->level == 1 && !x86_cpu_has_sse42)
bstate = deflate_fast(s, flush);
else
bstate = (*configuration_table[s->level].func)(s, flush);

if (bstate == finish_started || bstate == finish_done) {
s->status = FINISH_STATE;
Expand Down Expand Up @@ -1528,7 +1539,7 @@ local void check_match(s, start, match, length)
* performed for at least two bytes (required for the zip translate_eol
* option -- not supported here).
*/
local void fill_window(s)
ZLIB_INTERNAL void fill_window(s)
deflate_state *s;
{
unsigned n;
Expand Down
10 changes: 10 additions & 0 deletions deflate.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,9 @@
#define Buf_size 16
/* size of bit buffer in bi_buf */

#define END_BLOCK 256
/* end of block literal code */

#define INIT_STATE 42 /* zlib header -> BUSY_STATE */
#ifdef GZIP
# define GZIP_STATE 57 /* gzip header -> BUSY_STATE | EXTRA_STATE */
Expand All @@ -63,6 +66,12 @@
#define FINISH_STATE 666 /* stream complete */
/* Stream status */

/* Block state returned by a compression function (deflate_stored,
 * deflate_fast, deflate_quick, ...) after each call.
 */
typedef enum {
need_more, /* block not completed, need more input or more output */
block_done, /* block flush performed */
finish_started, /* finish started, need only more output at next deflate */
finish_done /* finish done, accept no more input or output */
} block_state;

/* Data structure describing a single value and its code string. */
typedef struct ct_data_s {
Expand Down Expand Up @@ -303,6 +312,7 @@ void ZLIB_INTERNAL _tr_flush_bits OF((deflate_state *s));
void ZLIB_INTERNAL _tr_align OF((deflate_state *s));
void ZLIB_INTERNAL _tr_stored_block OF((deflate_state *s, charf *buf,
ulg stored_len, int last));
void ZLIB_INTERNAL bi_windup OF((deflate_state *s));

#define d_code(dist) \
((dist) < 256 ? _dist_code[dist] : _dist_code[256+((dist)>>7)])
Expand Down
Loading

0 comments on commit ac43ebd

Please sign in to comment.