* The author would like to thank Hex-Rays SA for helping develop many…

… of these changes and pointing out their importance and use case. * Implement official XZ/CRC-64 and XZ/CRC-32 routines, since different checksum algorithms use different tables/polynomials and mechanisms, and especially in the case of XZ/CRC-64, it is a fairly unique algorithm (*not* ECMA-182). Since the XZ standard places the algorithm and sample code as part of their RFC, we'll call this XzCrc.c. These routines are only compiled in with MINLZ_INTEGRITY_CHECKS. * Introduce a new API -- XzChecksumError -- which returns whether the last decoded file encountered CRC errors (always returning false if MINLZ_INTEGRITY_CHECKS isn't used). This is to go alongside new behavior whereby checksum failures will no longer fail the decoding process but will instead attempt partial decompression anyway, which is especially helpful in cases where the archive size might be the only thing needed. This allows users to potentially implement their own error recovery mechanism, instead of having the API fail at all times. * Support (by ignoring) checksums other than XZ/CRC-32 and XZ/CRC-64, instead of failing. Correctly handle unrecognized checksum sizes, which are still defined in the standard. * Force MINLZ_META_CHECKS to be defined when MINLZ_INTEGRITY_CHECKS is used, as it is a dependency. * Remove XZ_DECODE_BLOCK_HEADER_RESULT return value and go back to a single true/false. If the file is empty, returning false will skip the call to XzDecodeBlock and set the output size to 0 bytes, and then, if meta-data checks are enabled, scan the index and stream footer -- if the file was truncated, these would fail anyway, but in the case of an empty file, the index and footer will be parsed OK. This has the benefit of correctly zero-initializing the output size if there appears not to be a valid block, as well. * Introduce const-correctness in a number of places. * A small number of stylistic changes, such as increased constant usage instead of magic numbers. 
* Guard #pragma warning for _MSC_VER only, for support with other/older compilers. * Bump version to 1.1.5 to recognize these changes.
ionescu007 · May 25, 2021 · 1f5976d · 1f5976d
1 parent e78970e
commit 1f5976d
Show file tree

Hide file tree

Showing 11 changed files with 359 additions and 109 deletions.
diff --git a/minlzdec/minlzdec.c b/minlzdec/minlzdec.c
@@ -22,6 +22,7 @@ main (
     uint32_t inputSize, outputSize;
     uint8_t* inputBuffer;
     uint8_t* outputBuffer;
+    char continueResult;
     struct stat stat;
     bool decodeResult;
 
@@ -30,8 +31,8 @@ main (
     inputBuffer = NULL;
     outputBuffer = NULL;
 
-    printf("minlzdec v.1.1.1 -- http://ionescu007.github.io/minlzma\n");
-    printf("Copyright(c) 2020 Alex Ionescu (@aionescu)\n\n");
+    printf("minlzdec v.1.1.5 -- http://ionescu007.github.io/minlzma\n");
+    printf("Copyright(c) 2020-2021 Alex Ionescu (@aionescu)\n\n");
     if (ArgumentCount != 3)
     {
         printf("Usage: minlzdec [INPUT FILE] [OUTPUT FILE]\n");
@@ -74,6 +75,16 @@ main (
         errno = ENOTSUP;
         goto Cleanup;
     }
+    else if (XzChecksumError())
+    {
+        printf("WARNING: Checksum error was encountered, continue decompression? [Y/N]\n");
+        fgets(&continueResult, 1, stdin);
+        if (continueResult != L'Y')
+        {
+            errno = EIO;
+            goto Cleanup;
+        }
+    }
 
     printf("Decompressed file will be %d bytes (%f%% ratio)\n",
             outputSize, (double)inputSize / (double)outputSize);

diff --git a/minlzlib/CMakeLists.txt b/minlzlib/CMakeLists.txt
@@ -1,4 +1,4 @@
-set(MINLZLIB_SOURCES "inputbuf.c" "dictbuf.c" "lzma2dec.c" "lzmadec.c" "rangedec.c" "xzstream.c" "lzmadec.h" "xzstream.h" "minlzlib.h")
+set(MINLZLIB_SOURCES "inputbuf.c" "dictbuf.c" "lzma2dec.c" "lzmadec.c" "rangedec.c" "xzcrc.c" "xzstream.c" "lzmadec.h" "xzstream.h" "minlzlib.h")
 add_library(minlz_obj OBJECT ${MINLZLIB_SOURCES})
 set_target_properties(minlz_obj PROPERTIES C_STANDARD 11 C_STANDARD_REQUIRED YES C_EXTENSIONS NO)
 

diff --git a/minlzlib/inputbuf.c b/minlzlib/inputbuf.c
@@ -35,7 +35,7 @@ typedef struct _BUFFER_STATE
     //
     // Start of the buffer, current offset, current packet end, and total input size
     //
-    uint8_t* Buffer;
+    const uint8_t* Buffer;
     uint32_t Offset;
     uint32_t SoftLimit;
     uint32_t Size;
@@ -85,7 +85,7 @@ BfResetSoftLimit (
 bool
 BfSeek (
     uint32_t Length,
-    uint8_t** Bytes
+    const uint8_t** Bytes
     )
 {
     //
@@ -107,7 +107,7 @@ BfRead (
     uint8_t* Byte
     )
 {
-    uint8_t* pByte;
+    const uint8_t* pByte;
     //
     // Seek past the byte and read it
     //
@@ -122,7 +122,7 @@ BfRead (
 
 void
 BfInitialize (
-    uint8_t* InputBuffer,
+    const uint8_t* InputBuffer,
     uint32_t InputSize
     )
 {

diff --git a/minlzlib/lzma2dec.c b/minlzlib/lzma2dec.c
@@ -72,7 +72,7 @@ Lz2DecodeStream (
     bool GetSizeOnly
     )
 {
-    uint8_t* inBytes;
+    const uint8_t* inBytes;
     LZMA2_CONTROL_BYTE controlByte;
     uint8_t propertyByte;
     uint32_t rawSize;
@@ -150,10 +150,10 @@ Lz2DecodeStream (
         {
             LzResetState();
         }
-        //
-        // else controlByte.u.Lzma.ResetState == Lzma2NoReset, since a two-bit
-        // field only has four possible values
-        //
+        else if (controlByte.u.Lzma.ResetState == Lzma2NoReset)
+        {
+            ;
+        }
 
         //
         // Don't do any decompression if the caller only wants to know the size

diff --git a/minlzlib/lzma2dec.h b/minlzlib/lzma2dec.h
@@ -23,7 +23,9 @@ Module Name:
 --*/
 
 #pragma once
+#ifdef _MSC_VER
 #pragma warning(disable:4214)
+#endif
 
 //
 // The most complex LZMA sequence possible is a "match" sequence where the

diff --git a/minlzlib/minlzlib.h b/minlzlib/minlzlib.h
@@ -36,9 +36,9 @@ Module Name:
 // Input Buffer Management
 //
 bool BfRead(uint8_t* Byte);
-bool BfSeek(uint32_t Length, uint8_t** Bytes);
+bool BfSeek(uint32_t Length, const uint8_t** Bytes);
 bool BfAlign(void);
-void BfInitialize(uint8_t* InputBuffer, uint32_t InputSize);
+void BfInitialize(const uint8_t* InputBuffer, uint32_t InputSize);
 bool BfSetSoftLimit(uint32_t Remaining);
 void BfResetSoftLimit(void);
 
@@ -80,9 +80,16 @@ void LzResetState(void);
 bool Lz2DecodeStream(uint32_t* BytesProcessed, bool GetSizeOnly);
 
 #ifdef MINLZ_INTEGRITY_CHECKS
+//
+// Integrity checks require metadata parsing and validation
+//
+#define MINLZ_META_CHECKS 1
+
 //
 // Checksum Management
 //
-uint32_t OsComputeCrc32(uint32_t Initial, const uint8_t* Data, uint32_t Length);
-#define Crc32(Buffer, Length) OsComputeCrc32(0, (const uint8_t*)Buffer, Length)
+uint32_t XzCrc32(uint32_t Crc, const uint8_t* Buffer, uint32_t Length);
+uint64_t XzCrc64(uint64_t Crc, const uint8_t* Buffer, uint32_t Length);
+#define Crc32(Buffer, Length) XzCrc32(0, (const uint8_t*)Buffer, Length)
+#define Crc64(Buffer, Length) XzCrc64(0, (const uint8_t*)Buffer, Length)
 #endif
diff --git a/minlzlib/rangedec.c b/minlzlib/rangedec.c
@@ -67,8 +67,8 @@ typedef struct _RANGE_DECODER_STATE
     //
     // Start and end location of the current stream's range encoder buffer
     //
-    uint8_t* Start;
-    uint8_t* Limit;
+    const uint8_t* Start;
+    const uint8_t* Limit;
     //
     // Current probability range and 32-bit arithmetic encoded sequence code
     //
@@ -83,7 +83,7 @@ RcInitialize (
     )
 {
     uint8_t i, rcByte;
-    uint8_t* chunkEnd;
+    const uint8_t* chunkEnd;
 
     //
     // Make sure that the input buffer has enough space for the requirements of
@@ -127,7 +127,7 @@ RcCanRead (
     void
     )
 {
-    uint8_t* pos;
+    const uint8_t* pos;
     //
     // We can keep reading symbols as long as we haven't reached the end of the
     // input buffer yet.
@@ -141,7 +141,7 @@ RcIsComplete (
     uint32_t* BytesProcessed
     )
 {
-    uint8_t* pos;
+    const uint8_t* pos;
     //
     // When the last symbol has been decoded, the last code should be zero as
     // there is nothing left to describe. Return the offset in the buffer where

diff --git a/minlzlib/xzcrc.c b/minlzlib/xzcrc.c
@@ -0,0 +1,162 @@
+/*++
+
+Copyright (c) Alex Ionescu.  All rights reserved.
+
+Module Name:
+
+    xzcrc.c
+
+Abstract:
+
+    This module implements the XZ checksum algorithms for CRC32 and CRC64. The
+    latter is a specialized implementation (often mislabelled "ECMA-182") which
+    is only available in Go, making it highly unlikely to be found in any other
+    OS or language runtime. See the XZ Format Specification, Section 6.
+
+Author:
+
+    Alex Ionescu (@aionescu) 15-May-2021 - Initial version
+
+Environment:
+
+    Windows & Linux, user mode and kernel mode.
+
+--*/
+
+#include "minlzlib.h"
+
+#ifdef MINLZ_INTEGRITY_CHECKS
+const uint32_t k_Crc32Polynomial = UINT32_C(0xEDB88320);
+const uint64_t k_Crc64Polynomial = UINT64_C(0xC96C5795D7870F42);
+
+//
+// XZ CRC state: a 256-entry lookup table per CRC width and a built flag
+//
+typedef struct _CHECKSUM_STATE
+{
+    uint32_t Crc32Table[256];
+    uint64_t Crc64Table[256];
+    bool Initialized;
+} CHECKSUM_STATE, * PCHECKSUM_STATE;
+CHECKSUM_STATE Checksum;
+
+void
+XzCrcInitialize (
+    void
+    )
+{
+    uint32_t i;
+    uint32_t j;
+    uint32_t crc32;
+    uint64_t crc64;
+
+    //
+    // Fill both CRC lookup tables on the first call; later calls are no-ops
+    //
+    if (!Checksum.Initialized)
+    {
+        //
+        // Build a table of all possible CRC values for each byte, essentially
+        // creating the checksums for either 00 00 00 XX in the case of 32-bit
+        // CRC or for 00 00 00 00 00 00 00 XX in the case of 64-bit CRC.
+        //
+        for (i = 0; i < 256; i++)
+        {
+            crc32 = i;
+            crc64 = i;
+
+            //
+            // Walk the 8 coefficients (bits) of the byte, where the LSB holds
+            // the coefficient of the highest degree term of the dividend.
+            //
+            for (j = 0; j < 8; j++)
+            {
+                //
+                // Is the current coefficient set?
+                //
+                if (crc32 & 1)
+                {
+                    //
+                    // Shift to the next coefficient and XOR in the polynomial
+                    //
+                    crc32 = (crc32 >> 1) ^ k_Crc32Polynomial;
+                }
+                else
+                {
+                    //
+                    // Nothing to XOR in; just shift to the next coefficient
+                    //
+                    crc32 >>= 1;
+                }
+
+                //
+                // Apply the same step to the 64-bit entry and its polynomial
+                //
+                if (crc64 & 1)
+                {
+                    crc64 = (crc64 >> 1) ^ k_Crc64Polynomial;
+                }
+                else
+                {
+                    crc64 >>= 1;
+                }
+            }
+
+            //
+            // Record the finished table entries for byte value i
+            //
+            Checksum.Crc32Table[i] = crc32;
+            Checksum.Crc64Table[i] = crc64;
+        }
+
+        //
+        // Mark the tables built (plain bool; no locking is visible here)
+        //
+        Checksum.Initialized = true;
+    }
+}
+
+uint32_t
+XzCrc32 (
+    uint32_t Crc,
+    const uint8_t *Buffer,
+    uint32_t Length
+    )
+{
+    uint32_t i;
+
+    //
+    // Returns the CRC-32 of Length bytes at Buffer, seeded with Crc. The seed
+    // is inverted on entry and the result is inverted again on return. This
+    // uses the Dilip V. Sarwate algorithm, one byte per step: the low 8 bits
+    // of the running CRC are XORed with the input byte to index the table,
+    // and that entry is XORed with the CRC shifted right by 8. It relies on:
+    // Mod(A * x^n, P(x)) = Mod(x^n * Mod(A, P(X)), P(X))
+    //
+    for (XzCrcInitialize(), Crc = ~Crc, i = 0; i < Length; ++i)
+    {
+        Crc = Checksum.Crc32Table[Buffer[i] ^ (Crc & 0xFF)] ^ (Crc >> 8);
+    }
+    return ~Crc;
+}
+
+uint64_t
+XzCrc64 (
+    uint64_t Crc,
+    const uint8_t *Buffer,
+    uint32_t Length
+    )
+{
+    uint32_t i;
+    //
+    // Returns the CRC-64 of Length bytes at Buffer, seeded with Crc, using the
+    // same byte-at-a-time table lookup as XzCrc32. For very large inputs, a
+    // "slicing by 8" approach (or "slicing by 4" for CRC-32) may be faster.
+    //
+    for (XzCrcInitialize(), Crc = ~Crc, i = 0; i < Length; ++i)
+    {
+        Crc = Checksum.Crc64Table[Buffer[i] ^ (Crc & 0xFF)] ^ (Crc >> 8);
+    }
+    return ~Crc;
+}
+#endif