Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow including compression level when training a dictionary #326

Merged
merged 1 commit into from
Sep 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 35 additions & 4 deletions src/main/java/com/github/luben/zstd/Zstd.java
Original file line number Diff line number Diff line change
Expand Up @@ -862,12 +862,26 @@ public static long getDirectByteBufferFrameContentSize(ByteBuffer src, int srcPo
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy) {
    // The caller gave no level: use the value of defaultCompressionLevel()
    // and hand off to the overload that takes an explicit compression level.
    final int level = defaultCompressionLevel();
    return trainFromBuffer(samples, dictBuffer, legacy, level);
}

/**
* Creates a new dictionary to tune a kind of samples
*
* @param samples the samples buffer array
* @param dictBuffer the new dictionary buffer
* @param legacy use the legacy training algorithm; otherwise cover
* @param compressionLevel the compression level to optimize the dictionary for; works best when it matches the level used when compressing
* @return the number of bytes into buffer 'dictBuffer' or an error code if
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel) {
if (samples.length <= 10) {
throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
}
return trainFromBuffer0(samples, dictBuffer, legacy);
return trainFromBuffer0(samples, dictBuffer, legacy, compressionLevel);
}
private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy);
private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel);

/**
* Creates a new dictionary to tune a kind of samples
Expand All @@ -880,12 +894,29 @@ public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy) {
    // The caller gave no level: use the value of defaultCompressionLevel()
    // and hand off to the overload that takes an explicit compression level.
    final int level = defaultCompressionLevel();
    return trainFromBufferDirect(samples, sampleSizes, dictBuffer, legacy, level);
}

/**
* Creates a new dictionary to tune a kind of samples
*
* @param samples the samples direct byte buffer array
* @param sampleSizes java integer array of sizes
* @param dictBuffer the new dictionary buffer (preallocated direct byte buffer)
* @param legacy use the legacy training algorithm; otherwise cover
* @param compressionLevel the compression level to optimize the dictionary for; works best when it matches the level used when compressing
* @return the number of bytes into buffer 'dictBuffer' or an error code if
* it fails (which can be tested using ZSTD_isError())
*/
public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel) {
if (sampleSizes.length <= 10) {
throw new ZstdException(Zstd.errGeneric(), "nb of samples too low");
}
return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy);
return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy, compressionLevel);
}
private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy);


private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel);

/**
* Get DictId from a compressed frame
Expand Down
9 changes: 6 additions & 3 deletions src/main/native/dictBuilder/zdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -1105,15 +1105,18 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,


size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, int compressionLevel)
{
ZDICT_fastCover_params_t params;
DEBUGLOG(3, "ZDICT_trainFromBuffer");
memset(&params, 0, sizeof(params));
params.d = 8;
params.steps = 4;
/* Use default level since no compression level information is available */
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
if (compressionLevel <= 0) {
params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT;
} else {
params.zParams.compressionLevel = compressionLevel;
}
#if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1)
params.zParams.notificationLevel = DEBUGLEVEL;
#endif
Expand Down
10 changes: 6 additions & 4 deletions src/main/native/jni_zdict.c
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#include <string.h>

JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
(JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy) {
(JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy, jint compressionLevel) {
size_t size = 0;
jsize num_samples = (*env)->GetArrayLength(env, samples);
size_t *samples_sizes = malloc(sizeof(size_t) * num_samples);
Expand Down Expand Up @@ -45,9 +45,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0
if (legacy == JNI_TRUE) {
ZDICT_legacy_params_t params;
memset(&params,0,sizeof(params));
params.zParams.compressionLevel = compressionLevel;
size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
} else {
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
}
(*env)->ReleasePrimitiveArrayCritical(env, dictBuffer, dict_buff, 0);
free(samples_buffer);
Expand All @@ -56,7 +57,7 @@ E1: return size;
}

JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
(JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy) {
(JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy, jint compressionLevel) {

size_t size = 0;
void *samples_buffer = (*env)->GetDirectBufferAddress(env, samples);
Expand All @@ -81,9 +82,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0
if (legacy == JNI_TRUE) {
ZDICT_legacy_params_t params;
memset(&params, 0, sizeof(params));
params.zParams.compressionLevel = compressionLevel;
size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params);
} else {
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples);
size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel);
}
E2: free(samples_sizes);
E1: return size;
Expand Down
3 changes: 2 additions & 1 deletion src/main/native/zdict.h
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,8 @@ extern "C" {
*/
ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
const void* samplesBuffer,
const size_t* samplesSizes, unsigned nbSamples);
const size_t* samplesSizes, unsigned nbSamples,
int compressionLevel);

typedef struct {
int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */
Expand Down
Loading