From 2abee3538360e72bdaec5ee3683e359ad19a0352 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 7 Mar 2024 09:27:27 -0800 Subject: [PATCH] Use streaming decompress to identify extent of compressed data. (#66) This fixes a problem that arises for local files with bit 3 of the general purpose bit flag set. In this case, we don't get information up front about the size of the compressed data. So how do we know where the compressed data ends? Previously, we tried to determine this by looking for the signature of the data descriptor. But the data descriptor doesn't always HAVE a signature, and it is also possible for signatures to occur accidentally in the compressed data itself (#65). Here we follow a clue from an Info-ZIP note: "In general, this feature can only be reliably used together with compression methods that allow intrinsic detection of the 'end-of-compressed-data' condition." We use the streaming decompression interface from zlib's Internal module. This tells us, in effect, where the compressed data ends. A parameter has been added to getCompressedData for the compressionMethod, since we only want to do streaming decompression if the data is compressed with Deflate. Closes #65. --- src/Codec/Archive/Zip.hs | 111 +++++++++++++++++++++++++-------------- 1 file changed, 71 insertions(+), 40 deletions(-) diff --git a/src/Codec/Archive/Zip.hs b/src/Codec/Archive/Zip.hs index 42b9cc7..ebb6665 100644 --- a/src/Codec/Archive/Zip.hs +++ b/src/Codec/Archive/Zip.hs @@ -101,8 +101,6 @@ import Data.List (partition) import Data.Maybe (fromJust) #endif -import GHC.Int (Int64) - -- from bytestring import qualified Data.ByteString as S import qualified Data.ByteString.Lazy as B @@ -114,8 +112,11 @@ import qualified Data.Text.Lazy.Encoding as TL -- from zlib import qualified Codec.Compression.Zlib.Raw as Zlib +import qualified Codec.Compression.Zlib.Internal as ZlibInt import System.IO.Error (isAlreadyExistsError) +-- import Debug.Trace + manySig :: Word32 -> Get a -> Get [a] manySig sig p = do sig' <- lookAhead getWord32le @@ -768,7 +769,11 @@ getLocalFile = do getWord32le >>= ensure (== 0x04034b50) skip 2 -- version bitflag <- getWord16le - skip 2 -- compressionMethod + rawCompressionMethod <- getWord16le + compressionMethod <- case rawCompressionMethod of + 0 -> return NoCompression + 8 -> return Deflate + _ -> fail $ "Unknown compression method " ++ show rawCompressionMethod skip 2 -- last mod file time skip 2 -- last mod file date skip 4 -- crc32 @@ -780,12 +785,13 @@ getLocalFile = do extraFieldLength <- getWord16le skip (fromIntegral fileNameLength) -- filename skip (fromIntegral extraFieldLength) -- extra field - compressedData <- if bitflag .&. 0O10 == 0 + compressedData <- + if bitflag .&. 0O10 == 0 then getLazyByteString (fromIntegral compressedSize) else -- If bit 3 of general purpose bit flag is set, -- then we need to read until we get to the -- data descriptor record. - do raw <- getCompressedData + do raw <- getCompressedData compressionMethod sig <- lookAhead getWord32le when (sig == 0x08074b50) $ skip 4 skip 4 -- crc32 @@ -793,43 +799,12 @@ getLocalFile = do skip 4 -- uncompressed size if fromIntegral cs == B.length raw then return raw - else fail "Content size mismatch in data descriptor record" + else fail $ printf + ("Content size mismatch in data descriptor record: " + <> "expected %d, got %d bytes") + cs (B.length raw) return (fromIntegral offset, compressedData) --- Move forward over data (not consuming it) until: --- - start of the next local file header --- - start of archive decryption header --- Then back up 12 bytes (the data description record) --- and possibly 4 more bytes --- (conventional but not required sig 0x08074b50 for data description record). -getCompressedData :: Get B.ByteString -getCompressedData = do - numbytes <- lookAhead $ findEnd 0 - getLazyByteString numbytes - where - chunkSize :: Int64 - chunkSize = 16384 - findEnd :: Int64 -> Get Int64 - findEnd n = do - sig <- lookAhead getWord32le - case sig of - 0x08074b50 -> skip 4 >> return n - 0x04034b50 -> -- sig for local file header - return (n - 12) -- rewind past data description - 0x02014b50 -> -- sig for file header - return (n - 12) -- rewind past data description - 0x06054b50 -> -- sig for end of central directory header - return (n - 12) -- rewind past data description - x | x .&. 0xFF == 0x50 -> skip 1 >> findEnd (n + 1) - _ -> do bs <- lookAhead $ getLazyByteString chunkSize - <|> getRemainingLazyByteString - let bsLen = B.length bs - let mbIdx = B.elemIndex 0x50 bs - case mbIdx of - Nothing -> skip (fromIntegral bsLen) >> findEnd (n + bsLen) - Just 0 -> skip 1 >> findEnd (n + 1) - Just idx -> skip (fromIntegral idx) >> findEnd (n + idx) - putLocalFile :: Entry -> Put putLocalFile f = do putWord32le 0x04034b50 @@ -992,3 +967,59 @@ toString = TL.unpack . TL.decodeUtf8 fromString :: String -> B.ByteString fromString = TL.encodeUtf8 . TL.pack + +data DecompressResult = + DecompressSuccess [S.ByteString] B.ByteString + -- chunks in reverse, remainder + | DecompressFailure ZlibInt.DecompressError + +getCompressedData :: CompressionMethod -> Get B.ByteString +getCompressedData NoCompression = do + -- we assume there will be a signature on the data descriptor, + -- otherwise we have no way of identifying where the data ends! + -- The signature 0x08074b50 is commonly used but not required by spec. + let findSigPos = do + w1 <- getWord8 + if w1 == 0x50 + then do + w2 <- getWord8 + if w2 == 0x4b + then do + w3 <- getWord8 + if w3 == 0x07 + then do + w4 <- getWord8 + if w4 == 0x08 + then (\x -> x - 4) <$> bytesRead + else findSigPos + else findSigPos + else findSigPos + else findSigPos + pos <- bytesRead + sigpos <- lookAhead findSigPos <|> + fail "getCompressedData can't find data descriptor signature" + let compressedBytes = sigpos - pos + getLazyByteString compressedBytes +getCompressedData Deflate = do + remainingBytes <- lookAhead getRemainingLazyByteString + let result = ZlibInt.foldDecompressStreamWithInput + (\bs res -> + case res of + DecompressSuccess chunks remainder + -> DecompressSuccess (bs:chunks) remainder + x -> x) + (DecompressSuccess []) + DecompressFailure + (ZlibInt.decompressST ZlibInt.rawFormat + ZlibInt.defaultDecompressParams{ + ZlibInt.decompressAllMembers = False }) + remainingBytes + case result of + DecompressFailure err -> fail (show err) + DecompressSuccess _chunks afterCompressedBytes -> + -- Consume the compressed bytes; we don't do anything with + -- the decompressed chunks. We are just decompressing as a + -- way of finding where the compressed data ends. + getLazyByteString + (fromIntegral (B.length remainingBytes - B.length afterCompressedBytes)) +