Skip to content

Commit

Permalink
Use streaming decompress to identify extent of compressed data. (#66)
Browse files Browse the repository at this point in the history
This fixes a problem that arises for local files with bit 3
of the general purpose bit flag set. In this case, we don't
get information up front about the size of the compressed
data.  So how do we know where the compressed data ends?
Previously, we tried to determine this by looking for the
signature of the data descriptor. But the data descriptor doesn't
always HAVE a signature, and it is also possible for signatures to
occur accidentally in the compressed data itself (#65).

Here we follow a clue from an Info-ZIP note:
"In general, this feature can only be reliably used
together with compression methods that allow intrinsic
detection of the 'end-of-compressed-data' condition."

We use the streaming decompression interface from
zlib's Internal module.  This tells us, in effect, where
the compressed data ends.

A parameter has been added to getCompressedData for
the compressionMethod, since we only want to do streaming
decompression if the data is compressed with Deflate.

Closes #65.
  • Loading branch information
jgm authored Mar 7, 2024
1 parent fa62708 commit 2abee35
Showing 1 changed file with 71 additions and 40 deletions.
111 changes: 71 additions & 40 deletions src/Codec/Archive/Zip.hs
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,6 @@ import Data.List (partition)
import Data.Maybe (fromJust)
#endif

import GHC.Int (Int64)

-- from bytestring
import qualified Data.ByteString as S
import qualified Data.ByteString.Lazy as B
Expand All @@ -114,8 +112,11 @@ import qualified Data.Text.Lazy.Encoding as TL

-- from zlib
import qualified Codec.Compression.Zlib.Raw as Zlib
import qualified Codec.Compression.Zlib.Internal as ZlibInt
import System.IO.Error (isAlreadyExistsError)

-- import Debug.Trace

manySig :: Word32 -> Get a -> Get [a]
manySig sig p = do
sig' <- lookAhead getWord32le
Expand Down Expand Up @@ -768,7 +769,11 @@ getLocalFile = do
getWord32le >>= ensure (== 0x04034b50)
skip 2 -- version
bitflag <- getWord16le
skip 2 -- compressionMethod
rawCompressionMethod <- getWord16le
compressionMethod <- case rawCompressionMethod of
0 -> return NoCompression
8 -> return Deflate
_ -> fail $ "Unknown compression method " ++ show rawCompressionMethod
skip 2 -- last mod file time
skip 2 -- last mod file date
skip 4 -- crc32
Expand All @@ -780,56 +785,26 @@ getLocalFile = do
extraFieldLength <- getWord16le
skip (fromIntegral fileNameLength) -- filename
skip (fromIntegral extraFieldLength) -- extra field
compressedData <- if bitflag .&. 0O10 == 0
compressedData <-
if bitflag .&. 0O10 == 0
then getLazyByteString (fromIntegral compressedSize)
else -- If bit 3 of general purpose bit flag is set,
-- then we need to read until we get to the
-- data descriptor record.
do raw <- getCompressedData
do raw <- getCompressedData compressionMethod
sig <- lookAhead getWord32le
when (sig == 0x08074b50) $ skip 4
skip 4 -- crc32
cs <- getWord32le -- compressed size
skip 4 -- uncompressed size
if fromIntegral cs == B.length raw
then return raw
else fail "Content size mismatch in data descriptor record"
else fail $ printf
("Content size mismatch in data descriptor record: "
<> "expected %d, got %d bytes")
cs (B.length raw)
return (fromIntegral offset, compressedData)

-- Signature-scanning variant of getCompressedData: determine how many bytes
-- of compressed data precede the data descriptor, then consume exactly that
-- many bytes.
--
-- The scan (findEnd) runs entirely under lookAhead, so it consumes nothing;
-- it only counts. It stops at the first of:
-- - the data descriptor signature 0x08074b50 (conventional but not
--   required): the data ends right before the signature;
-- - a local file header, file header, or end of central directory
--   signature: the data descriptor is taken to be the 12 bytes immediately
--   before it (crc32, compressed size, uncompressed size, no signature), so
--   the count is reduced by 12.
--
-- NOTE: the first matching 4-byte pattern wins, so the same byte sequence
-- occurring inside the compressed data itself ends the scan early.
getCompressedData :: Get B.ByteString
getCompressedData = do
-- Count the bytes without consuming them, then read that many for real.
numbytes <- lookAhead $ findEnd 0
getLazyByteString numbytes
where
-- Number of bytes inspected at a time when searching for the next 0x50.
chunkSize :: Int64
chunkSize = 16384
-- findEnd n: n is the number of bytes skipped so far; the result is the
-- length of the compressed data.
findEnd :: Int64 -> Get Int64
findEnd n = do
-- Peek at the next four bytes as a little-endian word.
sig <- lookAhead getWord32le
case sig of
0x08074b50 -> skip 4 >> return n
0x04034b50 -> -- sig for local file header
return (n - 12) -- rewind past data description
0x02014b50 -> -- sig for file header
return (n - 12) -- rewind past data description
0x06054b50 -> -- sig for end of central directory header
return (n - 12) -- rewind past data description
-- Next byte is 0x50 (low byte of the little-endian word) but the word is
-- not one of the signatures above: step over that single byte.
x | x .&. 0xFF == 0x50 -> skip 1 >> findEnd (n + 1)
-- Otherwise peek at up to chunkSize bytes (or whatever remains, if fewer)
-- and jump ahead to the next 0x50 candidate.
_ -> do bs <- lookAhead $ getLazyByteString chunkSize
<|> getRemainingLazyByteString
let bsLen = B.length bs
let mbIdx = B.elemIndex 0x50 bs
case mbIdx of
-- No candidate in this chunk: skip all of it.
Nothing -> skip (fromIntegral bsLen) >> findEnd (n + bsLen)
-- NOTE(review): the guard above already handles a leading 0x50, so
-- this case looks unreachable; it still guarantees forward progress.
Just 0 -> skip 1 >> findEnd (n + 1)
-- Skip up to (not past) the candidate byte and re-examine from there.
Just idx -> skip (fromIntegral idx) >> findEnd (n + idx)

putLocalFile :: Entry -> Put
putLocalFile f = do
putWord32le 0x04034b50
Expand Down Expand Up @@ -992,3 +967,59 @@ toString = TL.unpack . TL.decodeUtf8

fromString :: String -> B.ByteString
fromString = TL.encodeUtf8 . TL.pack

-- | Result of folding a streaming decompression over some input.
data DecompressResult
  = DecompressSuccess [S.ByteString] B.ByteString
    -- ^ Decompressed output chunks gathered by the fold, followed by the
    -- input left over after the end of the compressed stream.
    -- (Originally annotated "chunks in reverse, remainder"; the chunk
    -- order depends on how the fold conses them -- confirm before
    -- relying on it.)
  | DecompressFailure ZlibInt.DecompressError
    -- ^ The decompressor reported an error.

-- | Read the compressed payload of a local file entry whose size is not
-- recorded in its header (bit 3 of the general purpose bit flag is set),
-- leaving the input positioned just after the payload, i.e. at the start
-- of the data descriptor.
--
-- * 'NoCompression': stored data has no intrinsic end marker, so the end is
--   located by scanning for the data descriptor signature @0x08074b50@.
--   Fails with @"getCompressedData can't find data descriptor signature"@
--   if the signature never occurs in the remaining input.
--
-- * 'Deflate': the remaining input is run through zlib's streaming raw
--   decompressor purely to learn where the deflate stream ends; the
--   decompressed output is discarded. Fails with the shown
--   'ZlibInt.DecompressError' if decompression fails.
getCompressedData :: CompressionMethod -> Get B.ByteString
getCompressedData NoCompression = do
  -- We assume there will be a signature on the data descriptor,
  -- otherwise we have no way of identifying where the data ends!
  -- The signature 0x08074b50 is commonly used but not required by spec.
  pos <- bytesRead
  sigpos <- lookAhead findSigPos <|>
              fail "getCompressedData can't find data descriptor signature"
  getLazyByteString (sigpos - pos)
  where
    -- Position (in terms of 'bytesRead') of the first occurrence of the
    -- byte sequence 50 4b 07 08 (0x08074b50 in little-endian order).
    --
    -- Only the candidate first byte is consumed before the other three are
    -- checked under 'lookAhead'. On a mismatch the scan therefore resumes
    -- at the very next byte, so overlapping prefixes such as
    -- 50 50 4b 07 08 or 50 4b 50 4b 07 08 are still found.
    findSigPos = do
      w1 <- getWord8
      if w1 /= 0x50
        then findSigPos
        else do
          -- Fewer than three bytes left counts as "no match"; the
          -- recursive call then hits end of input and fails, which
          -- triggers the error message above.
          found <- lookAhead sigTail <|> return False
          if found
            then (\x -> x - 1) <$> bytesRead -- back up over the 0x50
            else findSigPos
    -- True when the next three bytes complete the signature.
    sigTail = do
      w2 <- getWord8
      w3 <- getWord8
      w4 <- getWord8
      return (w2 == 0x4b && w3 == 0x07 && w4 == 0x08)
getCompressedData Deflate = do
  -- Peek at everything that is left; nothing is consumed yet.
  remainingBytes <- lookAhead getRemainingLazyByteString
  let result = ZlibInt.foldDecompressStreamWithInput
                 -- Drop each decompressed chunk: only the position where
                 -- the stream ends matters, and retaining the chunks would
                 -- keep the whole decompressed file in memory.
                 (\_ res -> res)
                 (DecompressSuccess [])
                 DecompressFailure
                 (ZlibInt.decompressST ZlibInt.rawFormat
                   ZlibInt.defaultDecompressParams{
                     ZlibInt.decompressAllMembers = False })
                 remainingBytes
  case result of
    DecompressFailure err -> fail (show err)
    DecompressSuccess _ afterCompressedBytes ->
      -- Consume exactly the compressed bytes: everything that was
      -- available minus what the decompressor left untouched.
      getLazyByteString
        (fromIntegral (B.length remainingBytes - B.length afterCompressedBytes))

0 comments on commit 2abee35

Please sign in to comment.