Skip to content

Commit

Permalink
Pull Brotli code from Upstream (#20)
Browse files Browse the repository at this point in the history
* Pull Brotli code from upstream

* Update copyright year
  • Loading branch information
hyperxpro authored May 4, 2021
1 parent 490c492 commit 25ec20a
Show file tree
Hide file tree
Showing 32 changed files with 299 additions and 638 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Brotli4j provides Brotli compression and decompression for Java.
<dependency>
<groupId>com.aayushatharva.brotli4j</groupId>
<artifactId>brotli4j</artifactId>
<version>1.4.0</version>
<version>1.4.1</version>
</dependency>
```

Expand Down
432 changes: 0 additions & 432 deletions brotli/common/dictionary.bin

This file was deleted.

Binary file removed brotli/common/dictionary.bin.br
Binary file not shown.
13 changes: 11 additions & 2 deletions brotli/enc/block_splitter.c
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ static const double kCommandBlockSwitchCost = 13.5;
static const double kDistanceBlockSwitchCost = 14.6;
static const size_t kLiteralStrideLength = 70;
static const size_t kCommandStrideLength = 40;
static const size_t kDistanceStrideLength = 40;
static const size_t kSymbolsPerLiteralHistogram = 544;
static const size_t kSymbolsPerCommandHistogram = 530;
static const size_t kSymbolsPerDistanceHistogram = 544;
Expand Down Expand Up @@ -119,6 +120,8 @@ void BrotliDestroyBlockSplit(MemoryManager* m, BlockSplit* self) {
BROTLI_FREE(m, self->lengths);
}

/* Extracts literals, command distance and prefix codes, then applies
* SplitByteVector to create partitioning. */
void BrotliSplitBlock(MemoryManager* m,
const Command* cmds,
const size_t num_commands,
Expand All @@ -136,14 +139,20 @@ void BrotliSplitBlock(MemoryManager* m,
/* Create a continuous array of literals. */
CopyLiteralsToByteArray(cmds, num_commands, data, pos, mask, literals);
/* Create the block split on the array of literals.
Literal histograms have alphabet size 256. */
* Literal histograms can have alphabet size up to 256.
* Though, to accommodate context modeling, less than half of maximum size
* is allowed. */
SplitByteVectorLiteral(
m, literals, literals_count,
kSymbolsPerLiteralHistogram, kMaxLiteralHistograms,
kLiteralStrideLength, kLiteralBlockSwitchCost, params,
literal_split);
if (BROTLI_IS_OOM(m)) return;
BROTLI_FREE(m, literals);
/* NB: this might be a good place for injecting extra splitting without
* increasing encoder complexity; however, output partition would be less
* optimal than one produced with forced splitting inside
* SplitByteVector (FindBlocks / ClusterBlocks). */
}

{
Expand Down Expand Up @@ -181,7 +190,7 @@ void BrotliSplitBlock(MemoryManager* m,
SplitByteVectorDistance(
m, distance_prefixes, j,
kSymbolsPerDistanceHistogram, kMaxCommandHistograms,
kCommandStrideLength, kDistanceBlockSwitchCost, params,
kDistanceStrideLength, kDistanceBlockSwitchCost, params,
dist_split);
if (BROTLI_IS_OOM(m)) return;
BROTLI_FREE(m, distance_prefixes);
Expand Down
76 changes: 55 additions & 21 deletions brotli/enc/block_splitter_inc.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,46 +71,56 @@ static size_t FN(FindBlocks)(const DataType* data, const size_t length,
double* cost,
uint8_t* switch_signal,
uint8_t* block_id) {
const size_t data_size = FN(HistogramDataSize)();
const size_t bitmaplen = (num_histograms + 7) >> 3;
const size_t alphabet_size = FN(HistogramDataSize)();
const size_t bitmap_len = (num_histograms + 7) >> 3;
size_t num_blocks = 1;
size_t byte_ix;
size_t i;
size_t j;
BROTLI_DCHECK(num_histograms <= 256);

/* Trivial case: single histogram -> single block type. */
if (num_histograms <= 1) {
for (i = 0; i < length; ++i) {
block_id[i] = 0;
}
return 1;
}
memset(insert_cost, 0, sizeof(insert_cost[0]) * data_size * num_histograms);

/* Fill bitcost for each symbol of all histograms.
* Non-existing symbol cost: 2 + log2(total_count).
* Regular symbol cost: -log2(symbol_count / total_count). */
memset(insert_cost, 0,
sizeof(insert_cost[0]) * alphabet_size * num_histograms);
for (i = 0; i < num_histograms; ++i) {
insert_cost[i] = FastLog2((uint32_t)histograms[i].total_count_);
}
for (i = data_size; i != 0;) {
for (i = alphabet_size; i != 0;) {
/* Reverse order to use the 0-th row as a temporary storage. */
--i;
for (j = 0; j < num_histograms; ++j) {
insert_cost[i * num_histograms + j] =
insert_cost[j] - BitCost(histograms[j].data_[i]);
}
}
memset(cost, 0, sizeof(cost[0]) * num_histograms);
memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmaplen);

/* After each iteration of this loop, cost[k] will contain the difference
between the minimum cost of arriving at the current byte position using
entropy code k, and the minimum cost of arriving at the current byte
position. This difference is capped at the block switch cost, and if it
reaches block switch cost, it means that when we trace back from the last
position, we need to switch here. */
for (i = 0; i < length; ++i) {
const size_t byte_ix = i;
size_t ix = byte_ix * bitmaplen;
size_t insert_cost_ix = data[byte_ix] * num_histograms;
memset(cost, 0, sizeof(cost[0]) * num_histograms);
memset(switch_signal, 0, sizeof(switch_signal[0]) * length * bitmap_len);
for (byte_ix = 0; byte_ix < length; ++byte_ix) {
size_t ix = byte_ix * bitmap_len;
size_t symbol = data[byte_ix];
size_t insert_cost_ix = symbol * num_histograms;
double min_cost = 1e99;
double block_switch_cost = block_switch_bitcost;
size_t k;
for (k = 0; k < num_histograms; ++k) {
/* We are coding the symbol in data[byte_ix] with entropy code k. */
/* We are coding the symbol with entropy code k. */
cost[k] += insert_cost[insert_cost_ix + k];
if (cost[k] < min_cost) {
min_cost = cost[k];
Expand All @@ -126,20 +136,21 @@ static size_t FN(FindBlocks)(const DataType* data, const size_t length,
if (cost[k] >= block_switch_cost) {
const uint8_t mask = (uint8_t)(1u << (k & 7));
cost[k] = block_switch_cost;
BROTLI_DCHECK((k >> 3) < bitmaplen);
BROTLI_DCHECK((k >> 3) < bitmap_len);
switch_signal[ix + (k >> 3)] |= mask;
}
}
}

byte_ix = length - 1;
{ /* Trace back from the last position and switch at the marked places. */
size_t byte_ix = length - 1;
size_t ix = byte_ix * bitmaplen;
size_t ix = byte_ix * bitmap_len;
uint8_t cur_id = block_id[byte_ix];
while (byte_ix > 0) {
const uint8_t mask = (uint8_t)(1u << (cur_id & 7));
BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmaplen);
BROTLI_DCHECK(((size_t)cur_id >> 3) < bitmap_len);
--byte_ix;
ix -= bitmaplen;
ix -= bitmap_len;
if (switch_signal[ix + (cur_id >> 3)] & mask) {
if (cur_id != block_id[byte_ix]) {
cur_id = block_id[byte_ix];
Expand Down Expand Up @@ -185,6 +196,8 @@ static void FN(BuildBlockHistograms)(const DataType* data, const size_t length,
}
}

/* Given the initial partitioning build partitioning with limited number
* of histograms (and block types). */
static void FN(ClusterBlocks)(MemoryManager* m,
const DataType* data, const size_t length,
const size_t num_blocks,
Expand Down Expand Up @@ -228,6 +241,7 @@ static void FN(ClusterBlocks)(MemoryManager* m,

memset(block_lengths, 0, num_blocks * sizeof(uint32_t));

/* Calculate block lengths (convert repeating values -> series length). */
{
size_t block_idx = 0;
for (i = 0; i < length; ++i) {
Expand All @@ -240,15 +254,17 @@ static void FN(ClusterBlocks)(MemoryManager* m,
BROTLI_DCHECK(block_idx == num_blocks);
}

/* Pre-cluster blocks (cluster batches). */
for (i = 0; i < num_blocks; i += HISTOGRAMS_PER_BATCH) {
const size_t num_to_combine =
BROTLI_MIN(size_t, num_blocks - i, HISTOGRAMS_PER_BATCH);
size_t num_new_clusters;
size_t j;
for (j = 0; j < num_to_combine; ++j) {
size_t k;
size_t block_length = block_lengths[i + j];
FN(HistogramClear)(&histograms[j]);
for (k = 0; k < block_lengths[i + j]; ++k) {
for (k = 0; k < block_length; ++k) {
FN(HistogramAdd)(&histograms[j], data[pos++]);
}
histograms[j].bit_cost_ = FN(BrotliPopulationCost)(&histograms[j]);
Expand Down Expand Up @@ -278,14 +294,14 @@ static void FN(ClusterBlocks)(MemoryManager* m,
}
BROTLI_FREE(m, histograms);

/* Final clustering. */
max_num_pairs =
BROTLI_MIN(size_t, 64 * num_clusters, (num_clusters / 2) * num_clusters);
if (pairs_capacity < max_num_pairs + 1) {
BROTLI_FREE(m, pairs);
pairs = BROTLI_ALLOC(m, HistogramPair, max_num_pairs + 1);
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(pairs)) return;
}

clusters = BROTLI_ALLOC(m, uint32_t, num_clusters);
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(clusters)) return;
for (i = 0; i < num_clusters; ++i) {
Expand All @@ -298,6 +314,7 @@ static void FN(ClusterBlocks)(MemoryManager* m,
BROTLI_FREE(m, pairs);
BROTLI_FREE(m, cluster_size);

/* Assign blocks to final histograms. */
new_index = BROTLI_ALLOC(m, uint32_t, num_clusters);
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(new_index)) return;
for (i = 0; i < num_clusters; ++i) new_index[i] = kInvalidIndex;
Expand All @@ -313,6 +330,8 @@ static void FN(ClusterBlocks)(MemoryManager* m,
for (j = 0; j < block_lengths[i]; ++j) {
FN(HistogramAdd)(&histo, data[pos++]);
}
/* Among equally good histograms prefer last used. */
/* TODO: should we give a block-switch discount here? */
best_out = (i == 0) ? histogram_symbols[0] : histogram_symbols[i - 1];
best_bits =
FN(BrotliHistogramBitCostDistance)(&histo, &all_histograms[best_out]);
Expand All @@ -337,6 +356,9 @@ static void FN(ClusterBlocks)(MemoryManager* m,
BROTLI_ENSURE_CAPACITY(
m, uint32_t, split->lengths, split->lengths_alloc_size, num_blocks);
if (BROTLI_IS_OOM(m)) return;

/* Rewrite final assignment to block-split. There might be fewer blocks
* than |num_blocks| due to clustering. */
{
uint32_t cur_length = 0;
size_t block_idx = 0;
Expand All @@ -361,24 +383,36 @@ static void FN(ClusterBlocks)(MemoryManager* m,
BROTLI_FREE(m, histogram_symbols);
}

/* Create BlockSplit (partitioning) given the limits, estimates and "effort"
* parameters.
*
* NB: max_histograms is often less than number of histograms allowed by format;
* this is done intentionally, to save some "space" for context-aware
* clustering (here entropy is estimated for context-free symbols). */
static void FN(SplitByteVector)(MemoryManager* m,
const DataType* data, const size_t length,
const size_t literals_per_histogram,
const size_t symbols_per_histogram,
const size_t max_histograms,
const size_t sampling_stride_length,
const double block_switch_cost,
const BrotliEncoderParams* params,
BlockSplit* split) {
const size_t data_size = FN(HistogramDataSize)();
size_t num_histograms = length / literals_per_histogram + 1;
HistogramType* histograms;
/* Calculate number of histograms; initial estimate is one histogram per
* specified amount of symbols; however, this value is capped. */
size_t num_histograms = length / symbols_per_histogram + 1;
if (num_histograms > max_histograms) {
num_histograms = max_histograms;
}

/* Corner case: no input. */
if (length == 0) {
split->num_types = 1;
return;
} else if (length < kMinLengthForBlockSplitting) {
}

if (length < kMinLengthForBlockSplitting) {
BROTLI_ENSURE_CAPACITY(m, uint8_t,
split->types, split->types_alloc_size, split->num_blocks + 1);
BROTLI_ENSURE_CAPACITY(m, uint32_t,
Expand Down
59 changes: 42 additions & 17 deletions brotli/enc/hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#ifndef BROTLI_ENC_HASH_H_
#define BROTLI_ENC_HASH_H_

#include <stdlib.h> /* exit */
#include <string.h> /* memcmp, memset */

#include "../common/constants.h"
Expand All @@ -28,15 +29,28 @@ extern "C" {
#endif

typedef struct {
/* Dynamically allocated area; first member for quickest access. */
void* extra;
/**
* Dynamically allocated areas; regular hasher uses one or two allocations;
* "composite" hasher uses up to 4 allocations.
*/
void* extra[4];

/**
* False before the first invocation of HasherSetup (where "extra" memory
* is allocated).
*/
BROTLI_BOOL is_setup_;

size_t dict_num_lookups;
size_t dict_num_matches;

BrotliHasherParams params;

/* False if hasher needs to be "prepared" before use. */
/**
* False if hasher needs to be "prepared" before use (before the first
* invocation of HasherSetup or after HasherReset). "preparation" is hasher
* data initialization (using input ringbuffer).
*/
BROTLI_BOOL is_prepared_;
} HasherCommon;

Expand Down Expand Up @@ -391,42 +405,52 @@ typedef struct {

/* MUST be invoked before any other method. */
static BROTLI_INLINE void HasherInit(Hasher* hasher) {
hasher->common.extra = NULL;
hasher->common.is_setup_ = BROTLI_FALSE;
hasher->common.extra[0] = NULL;
hasher->common.extra[1] = NULL;
hasher->common.extra[2] = NULL;
hasher->common.extra[3] = NULL;
}

static BROTLI_INLINE void DestroyHasher(MemoryManager* m, Hasher* hasher) {
if (hasher->common.extra == NULL) return;
BROTLI_FREE(m, hasher->common.extra);
if (hasher->common.extra[0] != NULL) BROTLI_FREE(m, hasher->common.extra[0]);
if (hasher->common.extra[1] != NULL) BROTLI_FREE(m, hasher->common.extra[1]);
if (hasher->common.extra[2] != NULL) BROTLI_FREE(m, hasher->common.extra[2]);
if (hasher->common.extra[3] != NULL) BROTLI_FREE(m, hasher->common.extra[3]);
}

/* Marks the hasher as not "prepared" by clearing common.is_prepared_.
   Nothing else is touched: the "extra" allocations and the stored params
   keep their current values. HasherSetup tests this flag on each call. */
static BROTLI_INLINE void HasherReset(Hasher* hasher) {
hasher->common.is_prepared_ = BROTLI_FALSE;
}

static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
BROTLI_BOOL one_shot, const size_t input_size) {
static BROTLI_INLINE void HasherSize(const BrotliEncoderParams* params,
BROTLI_BOOL one_shot, const size_t input_size, size_t* alloc_size) {
switch (params->hasher.type) {
#define SIZE_(N) \
case N: \
return HashMemAllocInBytesH ## N(params, one_shot, input_size);
#define SIZE_(N) \
case N: \
HashMemAllocInBytesH ## N(params, one_shot, input_size, alloc_size); \
break;
FOR_ALL_HASHERS(SIZE_)
#undef SIZE_
default:
break;
}
return 0; /* Default case. */
}

static BROTLI_INLINE void HasherSetup(MemoryManager* m, Hasher* hasher,
BrotliEncoderParams* params, const uint8_t* data, size_t position,
size_t input_size, BROTLI_BOOL is_last) {
BROTLI_BOOL one_shot = (position == 0 && is_last);
if (hasher->common.extra == NULL) {
size_t alloc_size;
if (!hasher->common.is_setup_) {
size_t alloc_size[4] = {0};
size_t i;
ChooseHasher(params, &params->hasher);
alloc_size = HasherSize(params, one_shot, input_size);
hasher->common.extra = BROTLI_ALLOC(m, uint8_t, alloc_size);
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(hasher->common.extra)) return;
HasherSize(params, one_shot, input_size, alloc_size);
for (i = 0; i < 4; ++i) {
if (alloc_size[i] == 0) continue;
hasher->common.extra[i] = BROTLI_ALLOC(m, uint8_t, alloc_size[i]);
if (BROTLI_IS_OOM(m) || BROTLI_IS_NULL(hasher->common.extra[i])) return;
}
hasher->common.params = params->hasher;
switch (hasher->common.params.type) {
#define INITIALIZE_(N) \
Expand All @@ -440,6 +464,7 @@ static BROTLI_INLINE void HasherSetup(MemoryManager* m, Hasher* hasher,
break;
}
HasherReset(hasher);
hasher->common.is_setup_ = BROTLI_TRUE;
}

if (!hasher->common.is_prepared_) {
Expand Down
Loading

0 comments on commit 25ec20a

Please sign in to comment.