Skip to content

Commit

Permalink
util: introduce zlib decoder for targz file
Browse files Browse the repository at this point in the history
Introduce zlib decoder for targz file.

Signed-off-by: Jiang Liu <gerry@linux.alibaba.com>
  • Loading branch information
jiangliu committed Oct 17, 2022
1 parent 0ca9ada commit 981f254
Show file tree
Hide file tree
Showing 2 changed files with 90 additions and 48 deletions.
Binary file added tests/texture/zran/zlib_sample.txt.gz
Binary file not shown.
138 changes: 90 additions & 48 deletions utils/src/compress/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -156,10 +156,80 @@ pub fn decompress(
}
}

/// Stream decoder for zlib/gzip.
///
/// Holds a `GzDecoder` layered on top of a `BufReader` over the caller-supplied reader `R`.
/// NOTE(review): the inner type is a gzip decoder; whether raw zlib (RFC 1950) streams are
/// also accepted depends on `GzDecoder` itself — confirm before relying on the "zlib" name.
pub struct ZlibDecoder<R> {
// Inner decoder; all reads on `ZlibDecoder` are served from it.
stream: GzDecoder<BufReader<R>>,
}

impl<R: Read> ZlibDecoder<R> {
    /// Create a decoder that pulls its compressed input from `reader`.
    ///
    /// The reader is first wrapped in a `BufReader`, which is then handed to the `GzDecoder`.
    pub fn new(reader: R) -> Self {
        let buffered = BufReader::new(reader);
        let stream = GzDecoder::new(buffered);

        Self { stream }
    }
}

impl<R: Read> Read for ZlibDecoder<R> {
    /// Read into `buf` by delegating to the wrapped decoder.
    ///
    /// The return value (byte count or error) is passed through unchanged.
    fn read(&mut self, buf: &mut [u8]) -> Result<usize> {
        let decoder = &mut self.stream;
        decoder.read(buf)
    }
}

/// Estimate the maximum compressed data size from uncompressed data size.
///
/// Gzip is special in that it doesn't carry compress_size. We need to read the maximum possible
/// size of compressed data for `chunk_decompress_size`, and try to decompress
/// `chunk_decompress_size` bytes of data out of it.
///
/// # Arguments
/// * `size` - uncompressed data size in bytes.
/// * `max_size` - upper bound for the returned estimate.
///
/// # Returns
/// `size` plus the estimated gzip overhead, clamped to `max_size`. The arithmetic saturates, so
/// a huge `size` yields `max_size` instead of overflowing.
//
// Per man(1) gzip
// The worst case expansion is a few bytes for the gzip file header, plus 5 bytes every 32K block,
// or an expansion ratio of 0.015% for large files.
//
// Per http://www.zlib.org/rfc-gzip.html#header-trailer, each member has the following structure:
// +---+---+---+---+---+---+---+---+---+---+
// |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
// +---+---+---+---+---+---+---+---+---+---+
// (if FLG.FEXTRA set)
// +---+---+=================================+
// | XLEN |...XLEN bytes of "extra field"...| (more-->)
// +---+---+=================================+
// (if FLG.FNAME set)
// +=========================================+
// |...original file name, zero-terminated...| (more-->)
// +=========================================+
// (if FLG.FCOMMENT set)
// +===================================+
// |...file comment, zero-terminated...| (more-->)
// +===================================+
// (if FLG.FHCRC set)
// +---+---+
// | CRC16 |
// +---+---+
// +=======================+
// |...compressed blocks...| (more-->)
// +=======================+
// 0 1 2 3 4 5 6 7
// +---+---+---+---+---+---+---+---+
// | CRC32 | ISIZE |
// +---+---+---+---+---+---+---+---+
// gzip head+footer is at least 10+8 bytes, stargz header doesn't include any flags
// so it's 18 bytes. Let's read at least 128 bytes more, to allow the decompressor to
// find out end of the gzip stream.
//
// Ideally we should introduce a streaming cache for stargz that maintains internal
// chunks and expose stream APIs.
pub fn compute_compressed_gzip_size(size: usize, max_size: usize) -> usize {
    // 10-byte gzip header + 8-byte trailer + one unconditional 5-byte block overhead.
    const FIXED_OVERHEAD: usize = 10 + 8 + 5;
    // Overhead is budgeted per 16 KiB of input, which is more conservative than the
    // "5 bytes every 32K block" figure quoted from the gzip man page above.
    const BLOCK_SIZE: usize = 16 << 10;
    const PER_BLOCK_OVERHEAD: usize = 5;
    // Extra bytes so the decompressor can find the end of the gzip stream.
    const SLACK: usize = 128;

    // `size / BLOCK_SIZE * 5` is always smaller than `size`, so only the additions can overflow.
    let block_overhead = (size / BLOCK_SIZE) * PER_BLOCK_OVERHEAD;

    // Saturate instead of overflowing: an unchecked sum would panic in debug builds and wrap
    // to a tiny value in release builds when `size` is close to `usize::MAX`.
    let estimate = size
        .saturating_add(FIXED_OVERHEAD)
        .saturating_add(block_overhead)
        .saturating_add(SLACK);

    std::cmp::min(estimate, max_size)
}

#[cfg(test)]
mod tests {
use super::*;
use std::fs::OpenOptions;
use std::io::{Seek, SeekFrom};
use std::path::Path;
use vmm_sys_util::tempfile::TempFile;

#[test]
Expand Down Expand Up @@ -417,53 +487,25 @@ mod tests {
assert_eq!(sz, 4097);
assert_eq!(buf, decompressed);
}
}

/// Estimate the maximum compressed data size from uncompressed data size.
///
/// Gzip is special in that it doesn't carry compress_size. We need to read the maximum possible
/// size of compressed data for `chunk_decompress_size`, and try to decompress
/// `chunk_decompress_size` bytes of data out of it.
//
// Per man(1) gzip
// The worst case expansion is a few bytes for the gzip file header, plus 5 bytes every 32K block,
// or an expansion ratio of 0.015% for large files.
//
// Per http://www.zlib.org/rfc-gzip.html#header-trailer, each member has the following structure:
// +---+---+---+---+---+---+---+---+---+---+
// |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->)
// +---+---+---+---+---+---+---+---+---+---+
// (if FLG.FEXTRA set)
// +---+---+=================================+
// | XLEN |...XLEN bytes of "extra field"...| (more-->)
// +---+---+=================================+
// (if FLG.FNAME set)
// +=========================================+
// |...original file name, zero-terminated...| (more-->)
// +=========================================+
// (if FLG.FCOMMENT set)
// +===================================+
// |...file comment, zero-terminated...| (more-->)
// +===================================+
// (if FLG.FHCRC set)
// +---+---+
// | CRC16 |
// +---+---+
// +=======================+
// |...compressed blocks...| (more-->)
// +=======================+
// 0 1 2 3 4 5 6 7
// +---+---+---+---+---+---+---+---+
// | CRC32 | ISIZE |
// +---+---+---+---+---+---+---+---+
// gzip head+footer is at least 10+8 bytes, stargz header doesn't include any flags
// so it's 18 bytes. Let's read at least 128 bytes more, to allow the decompressor to
// find out end of the gzip stream.
//
// Ideally we should introduce a streaming cache for stargz that maintains internal
// chunks and expose stream APIs.
pub fn compute_compressed_gzip_size(size: usize, max_size: usize) -> usize {
let size = size + 10 + 8 + 5 + (size / (16 << 10)) * 5 + 128;

std::cmp::min(size, max_size)
#[test]
fn test_zlib_decoder() {
    // Locate the gzip fixture relative to the crate's manifest directory.
    let manifest_dir = std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR");
    let sample = Path::new(&manifest_dir).join("../tests/texture/zran/zlib_sample.txt.gz");
    let file = OpenOptions::new().read(true).open(&sample).unwrap();
    let mut decoder = ZlibDecoder::new(file);
    let mut buf = [0u8; 8];

    // The first 24 decompressed bytes arrive as three full 8-byte chunks.
    let full_chunks = ["This is ", "a test f", "ile for "];
    for expected in full_chunks.iter() {
        decoder.read_exact(&mut buf).unwrap();
        assert_eq!(&String::from_utf8_lossy(&buf), *expected);
    }

    // The tail is shorter than the buffer, so a plain `read` reports the partial length.
    let ret = decoder.read(&mut buf).unwrap();
    assert_eq!(ret, 6);
    assert_eq!(&String::from_utf8_lossy(&buf[0..6]), "zlib.\n");

    // A further read reports end of stream.
    let ret = decoder.read(&mut buf).unwrap();
    assert_eq!(ret, 0);
}
}

0 comments on commit 981f254

Please sign in to comment.