Fix dictionary training huffman segfault and small speed improvement #2773

Merged · 3 commits · Sep 14, 2021
Changes from all commits
19 changes: 8 additions & 11 deletions lib/compress/huf_compress.c
@@ -435,7 +435,7 @@ typedef struct {
 typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32];
 
 /* Number of buckets available for HUF_sort() */
-#define RANK_POSITION_TABLE_SIZE 128
+#define RANK_POSITION_TABLE_SIZE 192
 
 typedef struct {
   huffNodeTable huffNodeTbl;
@@ -444,18 +444,15 @@ typedef struct {
 
 /* RANK_POSITION_DISTINCT_COUNT_CUTOFF == Cutoff point in HUF_sort() buckets for which we use log2 bucketing.
  * Strategy is to use as many buckets as possible for representing distinct
- * counts while using the remainder to represent all counts up to HUF_BLOCKSIZE_MAX
- * using log2 bucketing.
+ * counts while using the remainder to represent all "large" counts.
  *
- * To satisfy this requirement for 128 buckets, we can do the following:
- * Let buckets 0-114 represent distinct counts of [0, 114]
- * Let buckets 115 to 126 represent counts of [115, HUF_BLOCKSIZE_MAX]. (the final bucket 127 must remain empty)
- *
- * Note that we don't actually need 17 buckets (assuming 2^17 maxcount) for log2 bucketing since
- * the first few buckets in the log2 bucketing representation are already covered by the distinct count bucketing.
+ * To satisfy this requirement for 192 buckets, we can do the following:
+ * Let buckets 0-166 represent distinct counts of [0, 166]
+ * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing.
  */
-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - BIT_highbit32(HUF_BLOCKSIZE_MAX) - 1
-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN)
+#define RANK_POSITION_MAX_COUNT_LOG 32
+#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */
+#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */
 
 /* Return the appropriate bucket index for a given count. See definition of
  * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy.
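The old constants sized the log2 buckets for counts no larger than HUF_BLOCKSIZE_MAX; the new ones allow counts up to 2^RANK_POSITION_MAX_COUNT_LOG, so the much larger counts that dictionary training can feed into HUF_sort() no longer index past the rank-position table. Below is a minimal, self-contained sketch (not zstd's own code) of the resulting bucketing rule: counts below the distinct-count cutoff each get their own bucket, and anything larger falls into a log2 bucket starting at RANK_POSITION_LOG_BUCKETS_BEGIN. The names highbit32 and bucket_index are illustrative stand-ins for the library's internal BIT_highbit32 and bucket-lookup helper.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for BIT_highbit32(): position of the highest set bit (v > 0). */
static unsigned highbit32(uint32_t v)
{
    unsigned r = 0;
    while (v >>= 1) r++;
    return r;
}

/* Constants mirroring the patch. */
#define TABLE_SIZE        192                                /* RANK_POSITION_TABLE_SIZE */
#define MAX_COUNT_LOG     32                                 /* RANK_POSITION_MAX_COUNT_LOG */
#define LOG_BUCKETS_BEGIN ((TABLE_SIZE - 1) - MAX_COUNT_LOG - 1)  /* 158 */

/* Counts below the distinct-count cutoff map to their own bucket;
 * larger counts share one bucket per power of two, so even a count
 * near 2^32 stays inside the 192-entry table. */
static unsigned bucket_index(uint32_t count, unsigned distinctCutoff)
{
    return count < distinctCutoff ? count
                                  : highbit32(count) + LOG_BUCKETS_BEGIN;
}

int main(void)
{
    unsigned const cutoff = LOG_BUCKETS_BEGIN + highbit32(LOG_BUCKETS_BEGIN);
    printf("cutoff=%u bucket(3)=%u bucket(200000)=%u bucket(1<<31)=%u\n",
           cutoff,
           bucket_index(3, cutoff),
           bucket_index(200000, cutoff),
           bucket_index(1u << 31, cutoff));
    return 0;
}

With 192 buckets, the largest possible log2 bucket index is 158 + 31 = 189, which stays in bounds; with the previous 128-bucket table the same inputs could run off the end, which is consistent with the segfault this PR fixes.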
9 changes: 7 additions & 2 deletions tests/playTests.sh
@@ -936,8 +936,13 @@ cat tmp | zstd -14 -f --size-hint=5500 | zstd -t # considerably too low
 
 
 println "\n===> dictionary tests "
 
-println "- test with raw dict (content only) "
+println "- Test high/low compressibility corpus training"
+datagen -g12M -P90 > tmpCorpusHighCompress
+datagen -g12M -P5 > tmpCorpusLowCompress
+zstd --train -B2K tmpCorpusHighCompress -o tmpDictHighCompress
+zstd --train -B2K tmpCorpusLowCompress -o tmpDictLowCompress
+rm -f tmpCorpusHighCompress tmpCorpusLowCompress tmpDictHighCompress tmpDictLowCompress
+println "- Test with raw dict (content only) "
 datagen > tmpDict
 datagen -g1M | $MD5SUM > tmp1
 datagen -g1M | zstd -D tmpDict | zstd -D tmpDict -dvq | $MD5SUM > tmp2