Skip to content

Commit

Permalink
Allow including compression level when training a dictionary: The com…
Browse files Browse the repository at this point in the history
…pression improves when the level for the training is close to the level for the compression step. (100% Compatible for the public API.)
  • Loading branch information
Morten Grouleff authored and luben committed Sep 22, 2024
1 parent eea07fc commit 3ca26ee
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 12 deletions.
39 changes: 35 additions & 4 deletions src/main/java/com/github/luben/zstd/Zstd.java
Original file line number Diff line number Diff line change
Expand Up @@ -862,12 +862,26 @@ public static long getDirectByteBufferFrameContentSize(ByteBuffer src, int srcPo
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy) {
    // No level was given by the caller: use whatever defaultCompressionLevel()
    // reports and forward to the overload that takes an explicit level.
    final int level = defaultCompressionLevel();
    return trainFromBuffer(samples, dictBuffer, legacy, level);
}

/**
* Creates a new dictionary to tune a kind of samples
*
* @param samples the samples buffer array
* @param dictBuffer the new dictionary buffer
* @param legacy use the legacy training algorithm; otherwise cover
* @param compressionLevel the compression level to train the dictionary for; results are best when it matches the level later used for compression
* @return the number of bytes into buffer 'dictBuffer' or an error code if
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel) {
// Refuse to train when 10 or fewer samples are supplied.
if (samples.length <= 10) {
throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
}
// NOTE(review): the three-argument call below appears to be the pre-change line
// from the diff view; the four-argument call after it supersedes it — confirm.
return trainFromBuffer0(samples, dictBuffer, legacy);
// Forward the samples, output buffer, algorithm flag and level to the native method.
return trainFromBuffer0(samples, dictBuffer, legacy, compressionLevel);
}
private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy);
private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel);

/**
* Creates a new dictionary to tune a kind of samples
Expand All @@ -880,12 +894,29 @@ public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy) {
    // No level was given by the caller: use whatever defaultCompressionLevel()
    // reports and forward to the overload that takes an explicit level.
    final int level = defaultCompressionLevel();
    return trainFromBufferDirect(samples, sampleSizes, dictBuffer, legacy, level);
}

/**
* Creates a new dictionary to tune a kind of samples
*
* @param samples the direct byte buffer containing the samples
* @param sampleSizes java integer array of sizes
* @param dictBuffer the new dictionary buffer (preallocated direct byte buffer)
* @param legacy use the legacy training algorithm; otherwise cover
* @param compressionLevel the compression level to train the dictionary for; results are best when it matches the level later used for compression
* @return the number of bytes into buffer 'dictBuffer' or an error code if
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel) {
// Refuse to train when 10 or fewer sample sizes are supplied.
if (sampleSizes.length <= 10) {
throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
}
// NOTE(review): the four-argument call below appears to be the pre-change line
// from the diff view; the five-argument call after it supersedes it — confirm.
return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy);
// Forward the buffers, sizes, algorithm flag and level to the native method.
return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy, compressionLevel);
}
private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy);


private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel);

/**
* Get DictId from a compressed frame
Expand Down
9 changes: 6 additions & 3 deletions src/main/native/dictBuilder/zdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -1105,15 +1105,18 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,


size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, int compressionLevel)
{
ZDICT_fastCover_params_t params;
DEBUGLOG(3, "ZDICT_trainFromBuffer");
memset(&params, 0, sizeof(params));
params.d = 8;
params.steps = 4;
/* Use default level since no compression level information is available */
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
if (compressionLevel <= 0) {
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
} else {
params.zParams.compressionLevel = compressionLevel;
}
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
params.zParams.notificationLevel = DEBUGLEVEL;
#endif
Expand Down
10 changes: 6 additions & 4 deletions src/main/native/jni_zdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <string.h>

JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
(JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy) {
(JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy, jint compressionLevel) {
size_t size = 0;
jsize num_samples = (*env)->GetArrayLength(env, samples);
size_t *samples_sizes = malloc(sizeof(size_t) * num_samples);
Expand Down Expand Up @@ -45,9 +45,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
if (legacy == JNI_TRUE) {
ZDICT_legacy_params_t params;
memset(&params,0,sizeof(params));
params.zParams.compressionLevel = compressionLevel;
size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
} else {
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
}
(*env)->ReleasePrimitiveArrayCritical(env, dictBuffer, dict_buff, 0);
free(samples_buffer);
Expand All @@ -56,7 +57,7 @@ E1: return size;
}

JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
(JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy) {
(JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy, jint compressionLevel) {

size_t size = 0;
void *samples_buffer = (*env)->GetDirectBufferAddress(env, samples);
Expand All @@ -81,9 +82,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
if (legacy == JNI_TRUE) {
ZDICT_legacy_params_t params;
memset(&params, 0, sizeof(params));
params.zParams.compressionLevel = compressionLevel;
size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
} else {
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
}
E2: free(samples_sizes);
E1: return size;
Expand Down
3 changes: 2 additions & 1 deletion src/main/native/zdict.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,8 @@ extern "C" {
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples);
const size_t* samplesSizes, unsigned nbSamples,
int compressionLevel);

typedef struct {
int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */
Expand Down

0 comments on commit 3ca26ee

Please sign in to comment.