From c51c9c3a48d522d4ffd32cf74425152fe8352efd Mon Sep 17 00:00:00 2001 From: Morten Grouleff Date: Wed, 11 Sep 2024 16:16:09 +0200 Subject: [PATCH] Allow including compression level when training a dictionary: The compression improves when the level for the training is close to the level for the compression step. (100% Compatible for the public API.) --- src/main/java/com/github/luben/zstd/Zstd.java | 39 +++++++++++++++++-- src/main/native/dictBuilder/zdict.c | 9 +++-- src/main/native/jni_zdict.c | 10 +++-- src/main/native/zdict.h | 3 +- 4 files changed, 49 insertions(+), 12 deletions(-) diff --git a/src/main/java/com/github/luben/zstd/Zstd.java b/src/main/java/com/github/luben/zstd/Zstd.java index c15c2e0..e79ca2a 100644 --- a/src/main/java/com/github/luben/zstd/Zstd.java +++ b/src/main/java/com/github/luben/zstd/Zstd.java @@ -862,12 +862,26 @@ public static long getDirectByteBufferFrameContentSize(ByteBuffer src, int srcPo * it fails (which can be tested using ZSTD_isError()) */ public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy) { + return trainFromBuffer(samples, dictBuffer, legacy, defaultCompressionLevel()); + } + + /** + * Creates a new dictionary to tune a kind of samples + * + * @param samples the samples buffer array + * @param dictBuffer the new dictionary buffer + * @param legacy use the legacy training algorithm; otherwise cover + * @param compressionLevel optimal if using the same level as when compressing. 
+ * @return the number of bytes into buffer 'dictBuffer' or an error code if + * it fails (which can be tested using ZSTD_isError()) + */ + public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel) { if (samples.length <= 10) { throw new ZstdException(Zstd.errGeneric(), "nb of samples too low"); } - return trainFromBuffer0(samples, dictBuffer, legacy); + return trainFromBuffer0(samples, dictBuffer, legacy, compressionLevel); } - private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy); + private static native long trainFromBuffer0(byte[][] samples, byte[] dictBuffer, boolean legacy, int compressionLevel); /** * Creates a new dictionary to tune a kind of samples @@ -880,12 +894,29 @@ public static long trainFromBuffer(byte[][] samples, byte[] dictBuffer, boolean * it fails (which can be tested using ZSTD_isError()) */ public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy) { + return trainFromBufferDirect(samples, sampleSizes, dictBuffer, legacy, defaultCompressionLevel()); + } + + /** + * Creates a new dictionary to tune a kind of samples + * + * @param samples the samples direct byte buffer array + * @param sampleSizes java integer array of sizes + * @param dictBuffer the new dictionary buffer (preallocated direct byte buffer) + * @param legacy use the legacy training algorithm; otherwise cover + * @param compressionLevel optimal if using the same level as when compressing. 
+ * @return the number of bytes into buffer 'dictBuffer' or an error code if + * it fails (which can be tested using ZSTD_isError()) + */ + public static long trainFromBufferDirect(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel) { if (sampleSizes.length <= 10) { throw new ZstdException(Zstd.errGeneric(), "nb of samples too low"); } - return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy); + return trainFromBufferDirect0(samples, sampleSizes, dictBuffer, legacy, compressionLevel); } - private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy); + + + private static native long trainFromBufferDirect0(ByteBuffer samples, int[] sampleSizes, ByteBuffer dictBuffer, boolean legacy, int compressionLevel); /** * Get DictId from a compressed frame diff --git a/src/main/native/dictBuilder/zdict.c b/src/main/native/dictBuilder/zdict.c index 82e999e..6e62d89 100644 --- a/src/main/native/dictBuilder/zdict.c +++ b/src/main/native/dictBuilder/zdict.c @@ -1105,15 +1105,18 @@ size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity, size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, - const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples) + const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, int compressionLevel) { ZDICT_fastCover_params_t params; DEBUGLOG(3, "ZDICT_trainFromBuffer"); memset(&params, 0, sizeof(params)); params.d = 8; params.steps = 4; - /* Use default level since no compression level information is available */ - params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; + if (compressionLevel <= 0) { + params.zParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; + } else { + params.zParams.compressionLevel = compressionLevel; + } #if defined(DEBUGLEVEL) && (DEBUGLEVEL>=1) params.zParams.notificationLevel = DEBUGLEVEL; #endif diff --git 
a/src/main/native/jni_zdict.c b/src/main/native/jni_zdict.c index 66f66e0..5aa9e6b 100644 --- a/src/main/native/jni_zdict.c +++ b/src/main/native/jni_zdict.c @@ -8,7 +8,7 @@ #include JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0 - (JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy) { + (JNIEnv *env, jclass obj, jobjectArray samples, jbyteArray dictBuffer, jboolean legacy, jint compressionLevel) { size_t size = 0; jsize num_samples = (*env)->GetArrayLength(env, samples); size_t *samples_sizes = malloc(sizeof(size_t) * num_samples); @@ -45,9 +45,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBuffer0 if (legacy == JNI_TRUE) { ZDICT_legacy_params_t params; memset(&params,0,sizeof(params)); + params.zParams.compressionLevel = compressionLevel; size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, params); } else { - size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples); + size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel); } (*env)->ReleasePrimitiveArrayCritical(env, dictBuffer, dict_buff, 0); free(samples_buffer); @@ -56,7 +57,7 @@ E1: return size; } JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0 - (JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy) { + (JNIEnv *env, jclass obj, jobject samples, jintArray sampleSizes, jobject dictBuffer, jboolean legacy, jint compressionLevel) { size_t size = 0; void *samples_buffer = (*env)->GetDirectBufferAddress(env, samples); @@ -81,9 +82,10 @@ JNIEXPORT jlong Java_com_github_luben_zstd_Zstd_trainFromBufferDirect0 if (legacy == JNI_TRUE) { ZDICT_legacy_params_t params; memset(&params, 0, sizeof(params)); + params.zParams.compressionLevel = compressionLevel; size = ZDICT_trainFromBuffer_legacy(dict_buff, dict_capacity, samples_buffer, 
samples_sizes, num_samples, params); } else { - size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples); + size = ZDICT_trainFromBuffer(dict_buff, dict_capacity, samples_buffer, samples_sizes, num_samples, compressionLevel); } E2: free(samples_sizes); E1: return size; diff --git a/src/main/native/zdict.h b/src/main/native/zdict.h index 2268f94..7eea29f 100644 --- a/src/main/native/zdict.h +++ b/src/main/native/zdict.h @@ -209,7 +209,8 @@ extern "C" { */ ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, const void* samplesBuffer, - const size_t* samplesSizes, unsigned nbSamples); + const size_t* samplesSizes, unsigned nbSamples, + int compressionLevel); typedef struct { int compressionLevel; /**< optimize for a specific zstd compression level; 0 means default */