[bug-fix] Fix rare corruption bug affecting the block splitter

The block splitter confuses sequences with literal length == 65536 that use a repeat offset code. It interprets this as literal length == 0 when deciding the meaning of the repeat offset, and corrupts the repeat offset history. This is benign, merely causing suboptimal compression performance, if the confused history is flushed before the end of the block, e.g. if there are 3 consecutive non-repeat code sequences after the mistake. It also is only triggered if the block splitter decided to split the block. All that to say: This is a rare bug, and requires quite a few conditions to trigger. However, the good news is that if you have a way to validate that the decompressed data is correct, e.g. you've enabled zstd's checksum or have a checksum elsewhere, the original data is very likely recoverable. So if you were affected by this bug please reach out. The fix is to remind the block splitter that the literal length is actually 64K. The test case is a bit tricky to set up, but I've managed to reproduce the issue. Thanks to @danlark1 for alerting us to the issue and providing us a reproducer!
facebook · Feb 23, 2023 · 395a2c5 · 395a2c5 · AV-Coding · Mar 26, 2024
1 parent 4ebaf36
commit 395a2c5
Show file tree

Hide file tree

Showing 4 changed files with 74 additions and 1 deletion.
diff --git a/lib/compress/zstd_compress.c b/lib/compress/zstd_compress.c
@@ -3859,9 +3859,10 @@ ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRe
                         const seqStore_t* const seqStore, U32 const nbSeq)
 {
     U32 idx = 0;
+    U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq;
     for (; idx < nbSeq; ++idx) {
         seqDef* const seq = seqStore->sequencesStart + idx;
-        U32 const ll0 = (seq->litLength == 0);
+        U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx);
         U32 const offBase = seq->offBase;
         assert(offBase > 0);
         if (OFFBASE_IS_REPCODE(offBase)) {

diff --git a/tests/cli-tests/compression/golden.sh b/tests/cli-tests/compression/golden.sh
@@ -10,3 +10,7 @@ zstd -r -t golden-compressed/
 
 zstd --target-compressed-block-size=1024 -rf golden/ --output-dir-mirror golden-compressed/
 zstd -r -t golden-compressed/
+
+# PR #3517 block splitter corruption test
+zstd -rf -19 --zstd=mml=7 golden/ --output-dir-mirror golden-compressed/
+zstd -r -t golden-compressed/
diff --git a/tests/fuzzer.c b/tests/fuzzer.c
@@ -1286,6 +1286,73 @@ static int basicUnitTests(U32 const seed, double compressibility)
     }
     DISPLAYLEVEL(3, "OK \n");
 
+    DISPLAYLEVEL(3, "test%3i : Check block splitter with 64K literal length : ", testNb++);
+    {   ZSTD_CCtx* cctx = ZSTD_createCCtx();
+        size_t const srcSize = 256 * 1024;
+        U32 const compressibleLenU32 = 32 * 1024 / 4;
+        U32 const blockSizeU32 = 128 * 1024 / 4;
+        U32 const litLenU32 = 64 * 1024 / 4;
+        U32* data = (U32*)malloc(srcSize);
+        size_t dSize;
+
+        if (data == NULL || cctx == NULL) goto _output_error;
+
+        /* Generate data without any matches */
+        RDG_genBuffer(data, srcSize, 0.0, 0.01, 2654435761U);
+        /* Generate 32K of compressible data */
+        RDG_genBuffer(data, compressibleLenU32 * 4, 0.5, 0.5, 0xcafebabe);
+
+        /* Add a match of offset=12, length=8 at idx=16, 32, 48, 64  */
+        data[compressibleLenU32 + 0] = 0xFFFFFFFF;
+        data[compressibleLenU32 + 1] = 0xEEEEEEEE;
+        data[compressibleLenU32 + 4] = 0xFFFFFFFF;
+        data[compressibleLenU32 + 5] = 0xEEEEEEEE;
+
+        /* Add a match of offset=16, length=8 at idx=64K + 64.
+         * This generates a sequence with llen=64K, and repeat code 1.
+         * The block splitter thought this was ll0, and corrupted the
+         * repeat offset history.
+         */
+        data[compressibleLenU32 + litLenU32 + 2 + 0] = 0xDDDDDDDD;
+        data[compressibleLenU32 + litLenU32 + 2 + 1] = 0xCCCCCCCC;
+        data[compressibleLenU32 + litLenU32 + 2 + 4] = 0xDDDDDDDD;
+        data[compressibleLenU32 + litLenU32 + 2 + 5] = 0xCCCCCCCC;
+
+        /* Add a match of offset=16, length=8 at idx=128K + 16.
+         * This should generate a sequence with repeat code = 1.
+         * But the block splitters mistake caused zstd to generate
+         * repeat code = 2, corrupting the data.
+         */
+        data[blockSizeU32] = 0xBBBBBBBB;
+        data[blockSizeU32 + 1] = 0xAAAAAAAA;
+        data[blockSizeU32 + 4] = 0xBBBBBBBB;
+        data[blockSizeU32 + 5] = 0xAAAAAAAA;
+
+        /* Generate a golden file from this data in case datagen changes and
+         * doesn't generate the exact same data. We will also test this golden file.
+         */
+        if (0) {
+            FILE* f = fopen("golden-compression/PR-3517-block-splitter-corruption-test", "wb");
+            fwrite(data, 1, srcSize, f);
+            fclose(f);
+        }
+
+        CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19));
+        CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, 7));
+        CHECK_Z(ZSTD_CCtx_setParameter(cctx, ZSTD_c_useBlockSplitter, ZSTD_ps_enable));
+
+        cSize = ZSTD_compress2(cctx, compressedBuffer, compressedBufferSize, data, srcSize);
+        CHECK_Z(cSize);
+        dSize = ZSTD_decompress(decodedBuffer, CNBuffSize, compressedBuffer, cSize);
+        CHECK_Z(dSize);
+        CHECK_EQ(dSize, srcSize);
+        CHECK(!memcmp(decodedBuffer, data, srcSize));
+
+        free(data);
+        ZSTD_freeCCtx(cctx);
+    }
+    DISPLAYLEVEL(3, "OK \n");
+
     DISPLAYLEVEL(3, "test%3d: superblock uncompressible data, too many nocompress superblocks : ", testNb++);
     {
         ZSTD_CCtx* const cctx = ZSTD_createCCtx();

diff --git a/tests/golden-compression/PR-3517-block-splitter-corruption-test b/tests/golden-compression/PR-3517-block-splitter-corruption-test