Skip to content

Commit

Permalink
deflate_compress: automatically select minimum match length
Browse files Browse the repository at this point in the history
In the greedy and lazy compressors, automatically increase the minimum
match length from the default of 3 if the data doesn't contain many
different literals.  This greatly improves the compression ratio of
levels 1-9 on certain types of data, such as DNA sequencing data, while
not worsening the ratio on other types of data.

The near-optimal compressor (used by compression levels 10-12) continues
to use a minimum match length of 3, since it already did a better job at
deciding when short matches are worthwhile.  (The method for setting the
initial costs needs improvement; later commits address that.)

Resolves #57
  • Loading branch information
ebiggers committed Dec 31, 2021
1 parent ddb2a5e commit 4019874
Showing 1 changed file with 123 additions and 5 deletions.
128 changes: 123 additions & 5 deletions lib/deflate_compress.c
Original file line number Diff line number Diff line change
Expand Up @@ -1990,6 +1990,101 @@ adjust_max_and_nice_len(unsigned *max_len, unsigned *nice_len, size_t remaining)
}
}

/*
 * Pick the minimum match length to be used by the greedy and lazy parsers.
 *
 * DEFLATE permits matches as short as 3 bytes, and 3 is the default minimum.
 * For some inputs (DNA sequencing data is one example), greedy and lazy
 * parsing do much better with a longer minimum, usually because literals
 * are very cheap to encode.  The near-optimal parser generally copes with
 * this on its own, but the greedy and lazy parsers need a heuristic to decide
 * when short matches are worth taking.
 *
 * The heuristic is driven by how many distinct literals the data uses:
 *
 *   - Many distinct literals: literals are probably expensive, so short
 *     matches are probably worthwhile.
 *   - Few distinct literals: literals are probably cheap, so short matches
 *     are probably not worthwhile.
 *
 * @num_used_literals: the number of distinct literals seen in the data
 * @max_search_depth: the matchfinder's search depth; a small depth lowers the
 *		      upper bound on the returned length
 *
 * Returns the chosen minimum match length.
 */
static unsigned
choose_min_match_len(unsigned num_used_literals, unsigned max_search_depth)
{
	/* Lookup table indexed by num_used_literals, giving the min length. */
	static const u8 min_lens[] = {
		9, 9, 9, 9, 9, 9, 8, 8, 7, 7, 6, 6, 6, 6, 6, 6,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
		5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
		/* every count past the end of the table maps to 3 */
	};
	unsigned table_len;
	unsigned len_cap;

	STATIC_ASSERT(DEFLATE_MIN_MATCH_LEN <= 3);
	STATIC_ASSERT(ARRAY_LEN(min_lens) <= DEFLATE_NUM_LITERALS + 1);

	/* Counts beyond the table get the default, regardless of depth. */
	if (num_used_literals >= ARRAY_LEN(min_lens))
		return 3;

	table_len = min_lens[num_used_literals];

	/* A search depth of 16 or more places no extra limit on the length. */
	if (max_search_depth >= 16)
		return table_len;

	/*
	 * Long matches may be too hard to find when the search depth is low,
	 * so limit the minimum length; the shallower the search, the lower
	 * the limit.
	 */
	if (max_search_depth < 5)
		len_cap = 4;
	else if (max_search_depth < 10)
		len_cap = 5;
	else
		len_cap = 7;

	return MIN(table_len, len_cap);
}

/*
 * Make an initial estimate of the minimum match length for a region of input
 * by counting how many distinct byte values occur near its start.
 *
 * @data: the input bytes to examine
 * @data_len: the number of bytes available at @data; at most the first 4096
 *	      are examined
 * @max_search_depth: passed through to choose_min_match_len()
 *
 * Returns the minimum match length chosen by choose_min_match_len() for the
 * number of distinct byte values found.
 */
static unsigned
calculate_min_match_len(const u8 *data, size_t data_len,
			unsigned max_search_depth)
{
	u8 used[256] = { 0 };
	unsigned num_used_literals = 0;
	size_t i;	/* same type as data_len: no signed/unsigned compare */

	/*
	 * For an initial approximation, scan only the first 4 KiB of data.
	 * A caller can refine the result later, once real symbol frequencies
	 * are known, by using recalculate_min_match_len().
	 */
	data_len = MIN(data_len, 4096);

	/* Flag every byte value that appears in the scanned prefix. */
	for (i = 0; i < data_len; i++)
		used[data[i]] = 1;

	/* Each flag is 0 or 1, so the sum is the number of distinct values. */
	for (i = 0; i < 256; i++)
		num_used_literals += used[i];

	return choose_min_match_len(num_used_literals, max_search_depth);
}

/*
 * Re-derive the minimum match length for a block from the literal
 * frequencies gathered so far (freqs->litlen), which reflect the literals
 * that are actually being used.
 *
 * @freqs: the symbol frequencies; only the first DEFLATE_NUM_LITERALS entries
 *	   of ->litlen are read
 * @max_search_depth: passed through to choose_min_match_len()
 *
 * Returns the minimum match length chosen by choose_min_match_len() for the
 * number of literals whose frequency exceeds the rarity threshold.
 */
static unsigned
recalculate_min_match_len(const struct deflate_freqs *freqs,
			  unsigned max_search_depth)
{
	u32 total_freq = 0;
	u32 rare_threshold;
	unsigned num_common_literals = 0;
	int sym;

	/* Sum the frequencies of all literal symbols. */
	for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
		total_freq += freqs->litlen[sym];

	/*
	 * Literals used very rarely are ignored: a literal is counted only
	 * if its frequency exceeds 1/1024 of the total.
	 */
	rare_threshold = total_freq >> 10;

	for (sym = 0; sym < DEFLATE_NUM_LITERALS; sym++)
		num_common_literals += (freqs->litlen[sym] > rare_threshold);

	return choose_min_match_len(num_common_literals, max_search_depth);
}

/*
* This is the level 0 "compressor". It always outputs uncompressed blocks.
*/
Expand Down Expand Up @@ -2032,11 +2127,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
const u8 * const in_block_begin = in_next;
const u8 * const in_max_block_end =
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
unsigned min_len;
u32 litrunlen = 0;
struct deflate_sequence *next_seq = c->p.g.sequences;

init_block_split_stats(&c->split_stats);
deflate_reset_symbol_frequencies(c);
min_len = calculate_min_match_len(in_next,
in_max_block_end - in_next,
c->max_search_depth);

do {
u32 length;
Expand All @@ -2048,15 +2147,15 @@ deflate_compress_greedy(struct libdeflate_compressor * restrict c,
&c->p.g.hc_mf,
&in_cur_base,
in_next,
DEFLATE_MIN_MATCH_LEN - 1,
min_len - 1,
max_len,
nice_len,
c->max_search_depth,
next_hashes,
&offset);

if (length > DEFLATE_MIN_MATCH_LEN ||
(length == DEFLATE_MIN_MATCH_LEN &&
if (length >= min_len &&
(length > DEFLATE_MIN_MATCH_LEN ||
offset <= 4096)) {
/* Match found. */
deflate_choose_match(c, length, offset,
Expand Down Expand Up @@ -2113,32 +2212,51 @@ deflate_compress_lazy_generic(struct libdeflate_compressor * restrict c,
const u8 * const in_block_begin = in_next;
const u8 * const in_max_block_end =
in_next + MIN(in_end - in_next, SOFT_MAX_BLOCK_LENGTH);
const u8 *next_recalc_min_len =
in_next + MIN(in_end - in_next, 10000);
unsigned min_len = DEFLATE_MIN_MATCH_LEN;
u32 litrunlen = 0;
struct deflate_sequence *next_seq = c->p.g.sequences;

init_block_split_stats(&c->split_stats);
deflate_reset_symbol_frequencies(c);

min_len = calculate_min_match_len(in_next,
in_max_block_end - in_next,
c->max_search_depth);
do {
unsigned cur_len;
unsigned cur_offset;
unsigned next_len;
unsigned next_offset;

/*
* Recalculate the minimum match length if it hasn't
* been done recently.
*/
if (in_next >= next_recalc_min_len) {
min_len = recalculate_min_match_len(
&c->freqs,
c->max_search_depth);
next_recalc_min_len +=
MIN(in_end - next_recalc_min_len,
in_next - in_block_begin);
}

/* Find the longest match at the current position. */
adjust_max_and_nice_len(&max_len, &nice_len,
in_end - in_next);
cur_len = hc_matchfinder_longest_match(
&c->p.g.hc_mf,
&in_cur_base,
in_next,
DEFLATE_MIN_MATCH_LEN - 1,
min_len - 1,
max_len,
nice_len,
c->max_search_depth,
next_hashes,
&cur_offset);
if (cur_len < DEFLATE_MIN_MATCH_LEN ||
if (cur_len < min_len ||
(cur_len == DEFLATE_MIN_MATCH_LEN &&
cur_offset > 8192)) {
/* No match found. Choose a literal. */
Expand Down

0 comments on commit 4019874

Please sign in to comment.