diff --git a/Cargo.lock b/Cargo.lock index 10095b8a719..27841520106 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -305,6 +305,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f82b0f4c27ad9f8bfd1f3208d882da2b09c301bc1c828fd3a00d0216d2fbbff6" dependencies = [ "crc32fast", + "libz-sys", "miniz_oxide", ] @@ -734,6 +735,18 @@ version = "0.2.126" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "349d5a591cd28b49e1d1037471617a32ddcda5731b99419008085f72d5a53836" +[[package]] +name = "libz-sys" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9702761c3935f8cc2f101793272e202c72b99da8f4224a19ddcf1279a6450bbf" +dependencies = [ + "cc", + "libc", + "pkg-config", + "vcpkg", +] + [[package]] name = "lock_api" version = "0.4.7" @@ -1038,6 +1051,7 @@ dependencies = [ "flate2", "lazy_static", "libc", + "libz-sys", "log", "lz4-sys", "nix", @@ -1045,6 +1059,7 @@ dependencies = [ "serde", "serde_json", "sha2", + "tar", "tokio", "vmm-sys-util", "zstd", diff --git a/tests/texture/zran/zlib_sample.txt.gz b/tests/texture/zran/zlib_sample.txt.gz new file mode 100644 index 00000000000..bb1f1e41ac3 Binary files /dev/null and b/tests/texture/zran/zlib_sample.txt.gz differ diff --git a/tests/texture/zran/zran-single-stream.tar.gz b/tests/texture/zran/zran-single-stream.tar.gz new file mode 100644 index 00000000000..5866d79f5c9 Binary files /dev/null and b/tests/texture/zran/zran-single-stream.tar.gz differ diff --git a/tests/texture/zran/zran-two-streams.tar.gz b/tests/texture/zran/zran-two-streams.tar.gz new file mode 100644 index 00000000000..e9e1f75a5f4 Binary files /dev/null and b/tests/texture/zran/zran-two-streams.tar.gz differ diff --git a/tests/texture/zran/zran-zero-file.tar.gz b/tests/texture/zran/zran-zero-file.tar.gz new file mode 100644 index 00000000000..8fd1f2b2d90 Binary files /dev/null and b/tests/texture/zran/zran-zero-file.tar.gz differ diff --git a/utils/Cargo.toml 
b/utils/Cargo.toml index 4c901e60178..e5329ffc6e0 100644 --- a/utils/Cargo.toml +++ b/utils/Cargo.toml @@ -10,9 +10,10 @@ edition = "2018" [dependencies] blake3 = "1.3" -flate2 = { version = "1.0", features = ["miniz-sys"], default-features = false } +flate2 = { version = "1.0", features = ["zlib"], default-features = false } lazy_static = "1.4" libc = "0.2" +libz-sys = { version = "1.1.8", optional = true } log = "0.4" lz4-sys = "1.9.4" serde = { version = ">=1.0.27", features = ["serde_derive", "rc"] } @@ -26,6 +27,10 @@ nydus-error = { version = "0.2", path = "../error" } [dev-dependencies] vmm-sys-util = ">=0.9.0" +tar = "0.4.38" + +[features] +zran = ["libz-sys"] [package.metadata.docs.rs] all-features = true diff --git a/utils/src/compress/mod.rs b/utils/src/compress/mod.rs index 92f51fd1238..87fc1853e41 100644 --- a/utils/src/compress/mod.rs +++ b/utils/src/compress/mod.rs @@ -19,8 +19,12 @@ use self::lz4_standard::*; mod zstd_standard; use self::zstd_standard::*; +#[cfg(feature = "zran")] +pub mod zlib_random; + const COMPRESSION_MINIMUM_RATIO: usize = 100; +/// Supported compression algorithms. #[derive(Clone, Copy, Debug, Eq, Hash, PartialEq)] pub enum Algorithm { None, @@ -92,18 +96,13 @@ impl TryFrom for Algorithm { } impl Algorithm { + /// Check whether the compression algorithm is none. pub fn is_none(self) -> bool { self == Self::None } } -// Algorithm::LZ4Block: -// 1. Default ratio -// 2. No prepend size - -// For compatibility reason, we use liblz4 version to compress/decompress directly -// with data blocks so that we don't really care about lz4 header magic numbers like -// as being done with all these rust lz4 implementations +/// Compress data with the specified compression algorithm. pub fn compress(src: &[u8], algorithm: Algorithm) -> Result<(Cow<[u8]>, bool)> { let src_size = src.len(); if src_size == 0 { @@ -157,10 +156,80 @@ pub fn decompress( } } +/// Stream decoder for zlib/gzip. 
+pub struct ZlibDecoder { + stream: GzDecoder>, +} + +impl ZlibDecoder { + pub fn new(reader: R) -> Self { + ZlibDecoder { + stream: GzDecoder::new(BufReader::new(reader)), + } + } +} + +impl Read for ZlibDecoder { + fn read(&mut self, buf: &mut [u8]) -> Result { + self.stream.read(buf) + } +} + +/// Estimate the maximum compressed data size from uncompressed data size. +/// +/// Gzip is special that it doesn't carry compress_size. We need to read the maximum possible size +/// of compressed data for `chunk_decompress_size`, and try to decompress `chunk_decompress_size` +/// bytes of data out of it. +// +// Per man(1) gzip +// The worst case expansion is a few bytes for the gzip file header, plus 5 bytes every 32K block, +// or an expansion ratio of 0.015% for large files. +// +// Per http://www.zlib.org/rfc-gzip.html#header-trailer, each member has the following structure: +// +---+---+---+---+---+---+---+---+---+---+ +// |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->) +// +---+---+---+---+---+---+---+---+---+---+ +// (if FLG.FEXTRA set) +// +---+---+=================================+ +// | XLEN |...XLEN bytes of "extra field"...| (more-->) +// +---+---+=================================+ +// (if FLG.FNAME set) +// +=========================================+ +// |...original file name, zero-terminated...| (more-->) +// +=========================================+ +// (if FLG.FCOMMENT set) +// +===================================+ +// |...file comment, zero-terminated...| (more-->) +// +===================================+ +// (if FLG.FHCRC set) +// +---+---+ +// | CRC16 | +// +---+---+ +// +=======================+ +// |...compressed blocks...| (more-->) +// +=======================+ +// 0 1 2 3 4 5 6 7 +// +---+---+---+---+---+---+---+---+ +// | CRC32 | ISIZE | +// +---+---+---+---+---+---+---+---+ +// gzip head+footer is at least 10+8 bytes, stargz header doesn't include any flags +// so it's 18 bytes. 
// Let's read at least 128 bytes more, to allow the decompressor to
// find out end of the gzip stream.
//
// Ideally we should introduce a streaming cache for stargz that maintains internal
// chunks and expose stream APIs.

/// Estimate the maximum compressed data size from uncompressed data size.
///
/// The estimate is `size` plus a fixed allowance (10-byte gzip header, 8-byte trailer, one
/// 5-byte block header and 128 bytes of slack) plus 5 bytes for every full 16 KiB of input.
/// The result is capped at `max_size`.
///
/// # Arguments
/// - `size`: size of the uncompressed data.
/// - `max_size`: upper bound for the returned value.
pub fn compute_compressed_gzip_size(size: usize, max_size: usize) -> usize {
    // Header (10) + trailer (8) + one block header (5) + slack so the decoder can see the end
    // of the stream (128).
    const FIXED_OVERHEAD: usize = 10 + 8 + 5 + 128;

    // Cannot overflow: the quotient times 5 is always smaller than `size`.
    let per_block_overhead = (size / (16 << 10)) * 5;

    // Saturate instead of overflowing when `size` is close to `usize::MAX`; the cap below then
    // still yields `max_size`.
    let estimate = size
        .saturating_add(FIXED_OVERHEAD)
        .saturating_add(per_block_overhead);

    std::cmp::min(estimate, max_size)
}
-// -// Per http://www.zlib.org/rfc-gzip.html#header-trailer, each member has the following structure: -// +---+---+---+---+---+---+---+---+---+---+ -// |ID1|ID2|CM |FLG| MTIME |XFL|OS | (more-->) -// +---+---+---+---+---+---+---+---+---+---+ -// (if FLG.FEXTRA set) -// +---+---+=================================+ -// | XLEN |...XLEN bytes of "extra field"...| (more-->) -// +---+---+=================================+ -// (if FLG.FNAME set) -// +=========================================+ -// |...original file name, zero-terminated...| (more-->) -// +=========================================+ -// (if FLG.FCOMMENT set) -// +===================================+ -// |...file comment, zero-terminated...| (more-->) -// +===================================+ -// (if FLG.FHCRC set) -// +---+---+ -// | CRC16 | -// +---+---+ -// +=======================+ -// |...compressed blocks...| (more-->) -// +=======================+ -// 0 1 2 3 4 5 6 7 -// +---+---+---+---+---+---+---+---+ -// | CRC32 | ISIZE | -// +---+---+---+---+---+---+---+---+ -// gzip head+footer is at least 10+8 bytes, stargz header doesn't include any flags -// so it's 18 bytes. Let's read at least 128 bytes more, to allow the decompressor to -// find out end of the gzip stream. -// -// Ideally we should introduce a streaming cache for stargz that maintains internal -// chunks and expose stream APIs. 
-pub fn compute_compressed_gzip_size(size: usize, max_size: usize) -> usize { - let size = size + 10 + 8 + 5 + (size / (16 << 10)) * 5 + 128; - - std::cmp::min(size, max_size) + #[test] + fn test_zlib_decoder() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = Path::new(root_dir).join("../tests/texture/zran/zlib_sample.txt.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + let mut decoder = ZlibDecoder::new(file); + let mut buf = [0u8; 8]; + + decoder.read_exact(&mut buf).unwrap(); + assert_eq!(&String::from_utf8_lossy(&buf), "This is "); + decoder.read_exact(&mut buf).unwrap(); + assert_eq!(&String::from_utf8_lossy(&buf), "a test f"); + decoder.read_exact(&mut buf).unwrap(); + assert_eq!(&String::from_utf8_lossy(&buf), "ile for "); + let ret = decoder.read(&mut buf).unwrap(); + assert_eq!(ret, 6); + assert_eq!(&String::from_utf8_lossy(&buf[0..6]), "zlib.\n"); + let ret = decoder.read(&mut buf).unwrap(); + assert_eq!(ret, 0); + } } diff --git a/utils/src/compress/zlib_random.rs b/utils/src/compress/zlib_random.rs new file mode 100644 index 00000000000..9ff2f627f48 --- /dev/null +++ b/utils/src/compress/zlib_random.rs @@ -0,0 +1,839 @@ +// Copyright (C) 2022 Alibaba Cloud. All rights reserved. +// +// SPDX-License-Identifier: Apache-2.0 + +//! Generate context information to randomly access gzip/zlib stream. + +use std::alloc::{self, Layout}; +use std::convert::TryFrom; +use std::io::{Read, Result}; +use std::ops::DerefMut; +use std::os::raw::{c_char, c_int, c_void}; +use std::sync::{Arc, Mutex}; +use std::{mem, ptr}; + +use libz_sys::{ + inflate, inflateEnd, inflateInit2_, inflatePrime, inflateReset, inflateSetDictionary, uInt, + z_stream, Z_BLOCK, Z_BUF_ERROR, Z_OK, Z_STREAM_END, +}; + +/// Size of inflate dictionary to support random access. +pub const ZRAN_DICT_WIN_SIZE: usize = 1 << 15; +/// Maximum number of random access slices per compression object. 
+pub const ZRAN_MAX_CI_ENTRIES: usize = 1 << 24; + +const ZRAN_READER_BUF_SIZE: usize = 64 * 1024; +const ZRAN_MIN_COMP_SIZE: u64 = 768 * 1024; +const ZRAN_MAX_COMP_SIZE: u64 = 2048 * 1024; +const ZRAN_MAX_UNCOMP_SIZE: u64 = 2048 * 1024; +const ZLIB_ALIGN: usize = std::mem::align_of::(); +const ZLIB_VERSION: &'static str = "1.2.8\0"; + +/// Information to retrieve a data chunk from an associated random access slice. +#[derive(Debug, Eq, PartialEq)] +pub struct ZranChunkInfo { + /// Index into the inflate context array for the associated inflate context. + pub ci_index: u32, + /// Offset to get data chunk from the uncompressed content. + pub ci_offset: u32, + /// Size of the uncompressed chunk data. + pub ci_len: u32, +} + +/// Context information to decode data from a random access slice. +pub struct ZranContext { + /// Offset in the original compression data stream. + pub in_offset: u64, + /// Offset in the uncompression data stream. + pub out_offset: u64, + /// Size of original compressed data. + pub in_len: u32, + /// Size of uncompressed data. + pub out_len: u32, + /// Optional previous byte in the original compressed data stream, used when `ctx_bits` is non-zero. + pub ctx_byte: u8, + /// Bits from previous byte to feeds into the inflate context for random access. + pub ctx_bits: u8, + /// Inflate dictionary for random access. + pub dict: Vec, +} + +impl ZranContext { + fn new(info: &ZranCompInfo, dict: Vec) -> Self { + ZranContext { + in_offset: info.in_pos, + out_offset: info.out_pos, + in_len: 0, + out_len: 0, + ctx_byte: info.previous_byte, + ctx_bits: info.pending_bits, + dict, + } + } +} + +/// Gzip/zlib decoder to randomly uncompress Gzip/zlib stream. +pub struct ZranDecoder { + stream: ZranStream, +} + +impl ZranDecoder { + /// Create a new instance of `ZranDecoder`. + pub fn new() -> Result { + let stream = ZranStream::new(true)?; + Ok(Self { stream }) + } + + /// Uncompress gzip/zlib compressed data chunk. 
+ /// + /// # Arguments + /// - ctx: context to random access compressed stream. + /// - dict: use this dictionary instead of `ctx.dict` to decode data + /// - input: input compressed data stream + /// - output: buffer to receive uncompressed data + pub fn uncompress( + &mut self, + ctx: &ZranContext, + dict: Option<&[u8]>, + input: &[u8], + output: &mut [u8], + ) -> Result { + if input.len() != ctx.in_len as usize { + return Err(einval!("size of input buffer doesn't match")); + } else if ctx.out_len as usize > output.len() { + return Err(einval!("buffer to receive decompressed data is too small")); + } + + self.stream.reset()?; + if ctx.ctx_bits != 0 { + let bits = ctx.ctx_bits & 0x7; + self.stream.set_prime(bits, ctx.ctx_byte)?; + } + let dict = dict.unwrap_or(ctx.dict.as_slice()); + self.stream.set_dict(dict)?; + + self.stream.set_next_in(&input); + self.stream.set_avail_in(ctx.in_len as uInt); + self.stream.set_next_out(output); + self.stream.set_avail_out(ctx.out_len as uInt); + let ret = self.stream.inflate(true); + match ret { + Z_OK => { + let count = self.stream.next_out() as usize - output.as_ptr() as usize; + if count != ctx.out_len as usize { + Err(eio!("failed to decode data from stream, size mismatch")) + } else { + Ok(count) + } + } + _ => Err(eio!("failed to decode data from compressed data stream")), + } + } +} + +/// Struct to generate random access information for OCIv1 image tarballs. +/// +/// `ZranGenerator` generates decompression context information to support random access to the +/// tarball later. It only tracks information related to Tar file content, and ignores all other +/// tar headers and zlib headers when possible. The work flow is: +/// 1) create a `ZranGenerator` object `zran`. +/// 2) create a tar::Archive object from `zran`. 
+/// 3) walk all entries in the tarball, for each tar regular file: +/// 3.1) get file size and split it into chunks, for each file data chunk +/// 3.2) call zran.begin_data_chunk() +/// 3.3) read file content from the tar Entry object +/// 3.4) call zran.end_data_chunk() to get chunk decompression information +/// 4) call zran.get_compression_info_array() to get all decompression context information for +/// random access later +pub struct ZranGenerator { + reader: ZranReader, + min_comp_size: u64, + max_comp_size: u64, + max_uncomp_size: u64, + curr_block_start: u64, + curr_ci_offset: u64, + curr_ci_idx: Option, + ci_array: Vec, +} + +impl ZranGenerator { + /// Create a new instance of `ZranGenerator` from a reader. + pub fn new(reader: ZranReader) -> Self { + Self { + reader, + min_comp_size: ZRAN_MIN_COMP_SIZE, + max_comp_size: ZRAN_MAX_COMP_SIZE, + max_uncomp_size: ZRAN_MAX_UNCOMP_SIZE, + curr_block_start: 0, + curr_ci_offset: 0, + curr_ci_idx: None, + ci_array: Vec::new(), + } + } + + /// Begin a transaction to read data from the zlib stream. + /// + /// # Arguments + /// - `chunk_size`: size of data to be read from the zlib stream. + pub fn begin_read(&mut self, chunk_size: u64) -> Result { + let info = self.reader.get_current_ctx_info(); + let ci_idx = if let Some(idx) = self.curr_ci_idx { + let ctx = &self.ci_array[idx]; + let comp_size = info.in_pos - ctx.in_offset; + let uncomp_size = info.out_pos - ctx.out_offset; + let first = self.is_first_block(); + let enough = !first + && (comp_size >= self.max_comp_size / 2 + || uncomp_size + chunk_size >= self.max_uncomp_size); + if info.stream_switched != 0 || enough { + // The slice becomes too big after merging current data chunk. + self.new_ci_entry()? + } else if !first + && comp_size > 2 * ctx.in_len as u64 + && ctx.in_len as u64 > self.min_comp_size + { + // The gap between current chunk and last chunk is too big. + self.new_ci_entry()? + } else { + idx + } + } else { + self.new_ci_entry()? 
+ }; + + if ci_idx > ZRAN_MAX_CI_ENTRIES { + Err(einval!("too many compression information entries")) + } else { + self.curr_ci_idx = Some(ci_idx); + self.curr_ci_offset = info.out_pos; + Ok(ci_idx as u32) + } + } + + /// Mark end of a data read operation and returns information to decode data from the random + /// access slice. + pub fn end_read(&mut self) -> Result { + let info = self.reader.get_current_ctx_info(); + if let Some(idx) = self.curr_ci_idx { + let ctx = &mut self.ci_array[idx]; + let comp_size = info.in_pos - ctx.in_offset; + let uncomp_size = info.out_pos - ctx.out_offset; + /* + if uncomp_size > self.max_uncomp_size || comp_size > self.max_comp_size { + return Err(einval!("uncompressed data size is too big")); + } + */ + + let ci = ZranChunkInfo { + ci_index: idx as u32, + ci_offset: (self.curr_ci_offset - ctx.out_offset) as u32, + ci_len: (info.out_pos - self.curr_ci_offset) as u32, + }; + ctx.out_len = uncomp_size as u32; + ctx.in_len = comp_size as u32; + Ok(ci) + } else { + Err(einval!("invalid compression state")) + } + } + + /// Get an immutable reference to the random access context information array. + pub fn get_compression_ctx_array(&mut self) -> &[ZranContext] { + self.curr_ci_idx = None; + &self.ci_array + } + + /// Set minimal compressed size to emit an random access slice. + /// + /// Please ensure "min_compressed_size * 2 <= max_compressed_size". + pub fn set_min_compressed_size(&mut self, sz: u64) { + self.min_comp_size = sz; + } + + /// Set maximum compressed size to emit an random access slice. + /// + /// Please ensure "min_compressed_size * 2 <= max_compressed_size". + pub fn set_max_compressed_size(&mut self, sz: u64) { + self.max_comp_size = sz; + } + + /// Set maximum uncompressed size to emit an random access slice. + /// + /// Please ensure "min_compressed_size * 2 < max_compressed_size". 
+ pub fn set_max_uncompressed_size(&mut self, sz: u64) { + self.max_uncomp_size = sz; + } + + fn new_ci_entry(&mut self) -> Result { + let info = self.reader.get_block_ctx_info(); + let dict = self.reader.get_block_ctx_dict(); + self.ci_array.push(ZranContext::new(&info, dict)); + self.curr_block_start = info.in_pos; + Ok(self.ci_array.len() - 1) + } + + fn is_first_block(&self) -> bool { + let info = self.reader.get_block_ctx_info(); + info.in_pos == self.curr_block_start + } +} + +/// A specialized gzip reader for OCI image tarballs. +/// +/// This reader assumes that the compressed file is a tar file, and restricts access patterns. +pub struct ZranReader { + inner: Arc>>, +} + +impl ZranReader { + /// Create a `ZranReader` from a reader. + pub fn new(reader: R) -> Result { + let inner = ZranReaderState::new(reader)?; + Ok(Self { + inner: Arc::new(Mutex::new(inner)), + }) + } + + /// Get inflate context information for current inflate position. + fn get_current_ctx_info(&self) -> ZranCompInfo { + self.inner.lock().unwrap().get_compression_info() + } + + /// Get inflate context information for current inflate block. + fn get_block_ctx_info(&self) -> ZranCompInfo { + self.inner.lock().unwrap().block_ctx_info + } + + /// Get inflate dictionary for current inflate block. 
+ fn get_block_ctx_dict(&self) -> Vec { + let state = self.inner.lock().unwrap(); + state.block_ctx_dict[..state.block_ctx_dict_size].to_vec() + } +} + +impl Read for ZranReader { + fn read(&mut self, buf: &mut [u8]) -> Result { + self.inner.lock().unwrap().read(buf) + } +} + +impl Clone for ZranReader { + fn clone(&self) -> Self { + Self { + inner: self.inner.clone(), + } + } +} + +#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)] +struct ZranCompInfo { + in_pos: u64, + out_pos: u64, + flags: u32, + previous_byte: u8, + pending_bits: u8, + stream_switched: u8, +} + +struct ZranReaderState { + stream: ZranStream, + input: Vec, + reader: R, + block_ctx_info: ZranCompInfo, + block_ctx_dict: Vec, + block_ctx_dict_size: usize, + stream_switched: u8, +} + +impl ZranReaderState { + fn new(reader: R) -> Result { + let mut stream = ZranStream::new(false)?; + let mut input = vec![0u8; ZRAN_READER_BUF_SIZE]; + stream.set_next_in(&mut input); + stream.set_avail_in(0); + + Ok(ZranReaderState { + stream, + input, + reader, + block_ctx_info: ZranCompInfo::default(), + block_ctx_dict: vec![0u8; ZRAN_DICT_WIN_SIZE], + block_ctx_dict_size: 0, + stream_switched: 0, + }) + } + + /// Get decompression information about the stream. + fn get_compression_info(&mut self) -> ZranCompInfo { + let stream_switched = self.stream_switched; + self.stream_switched = 0; + self.stream + .get_compression_info(&self.input, stream_switched) + } + + fn get_compression_dict(&mut self) -> Result<()> { + self.block_ctx_dict_size = self.stream.get_compression_dict(&mut self.block_ctx_dict)?; + Ok(()) + } +} + +impl Read for ZranReaderState { + fn read(&mut self, buf: &mut [u8]) -> Result { + self.stream.set_next_out(buf); + self.stream.set_avail_out(buf.len() as u32); + + loop { + // Reload the input buffer when needed. 
+ if self.stream.avail_in() == 0 { + let sz = self.reader.read(self.input.as_mut_slice())?; + if sz == 0 { + return Ok(0); + } + self.stream.set_next_in(&self.input); + self.stream.set_avail_in(sz as u32); + } + + match self.stream.inflate(false) { + Z_STREAM_END => { + self.stream.reset()?; + self.stream_switched = 1; + continue; + } + Z_OK => { + let count = self.stream.next_out() as usize - buf.as_ptr() as usize; + let info = self.get_compression_info(); + if info.flags & 0x80 != 0 { + self.get_compression_dict()?; + self.block_ctx_info = info; + } + if count == 0 { + // zlib/gzip compression header, continue for next data block. + continue; + } else { + return Ok(count); + } + } + Z_BUF_ERROR => { + if self.stream.avail_in() == 0 { + // Need more input data, continue to feed data into the input buffer. + continue; + } else { + return Err(eio!("failed to decode data from compressed data stream")); + } + } + _ => { + return Err(eio!("failed to decode data from compressed data stream")); + } + } + } + } +} + +struct ZranStream { + stream: Box, + total_in: u64, + total_out: u64, +} + +impl ZranStream { + fn new(decode: bool) -> Result { + let mut stream = Box::new(z_stream { + next_in: ptr::null_mut(), + avail_in: 0, + total_in: 0, + next_out: ptr::null_mut(), + avail_out: 0, + total_out: 0, + msg: ptr::null_mut(), + adler: 0, + data_type: 0, + reserved: 0, + opaque: ptr::null_mut(), + state: ptr::null_mut(), + zalloc, + zfree, + }); + // windowBits can also be greater than 15 for optional gzip decoding. + // Add 32 to windowBits to enable zlib and gzip decoding with automatic header detection, + // or add 16 to decode only the gzip format (the zlib format will return a Z_DATA_ERROR). + // -15 means raw mode. 
+ let mode = if decode { -15 } else { 31 }; + let ret = unsafe { + inflateInit2_( + stream.deref_mut() as *mut z_stream, + mode, + ZLIB_VERSION.as_ptr() as *const c_char, + mem::size_of::() as c_int, + ) + }; + if ret != Z_OK { + return Err(einval!("failed to initialize zlib inflate context")); + } + + Ok(Self { + stream, + total_in: 0, + total_out: 0, + }) + } + + fn inflate(&mut self, decode: bool) -> i32 { + // Z_BLOCK requests that inflate() stop if and when it gets to the next deflate block + // boundary. When decoding the zlib or gzip format, this will cause inflate() to return + // immediately after the header and before the first block. When doing a raw inflate, + // inflate() will go ahead and process the first block, and will return when it gets to + // the end of that block, or when it runs out of data. + let mode = if decode { 0 } else { Z_BLOCK }; + self.total_in += self.stream.avail_in as u64; + self.total_out += self.stream.avail_out as u64; + let ret = unsafe { inflate(self.stream.deref_mut() as *mut z_stream, mode) }; + self.total_in -= self.stream.avail_in as u64; + self.total_out -= self.stream.avail_out as u64; + ret + } + + fn reset(&mut self) -> Result<()> { + let ret = unsafe { inflateReset(self.stream.deref_mut() as *mut z_stream) }; + if ret != Z_OK { + return Err(einval!("failed to reset zlib inflate context")); + } + Ok(()) + } + + fn get_compression_info(&mut self, buf: &[u8], stream_switched: u8) -> ZranCompInfo { + let previous_byte = if self.stream.data_type & 0x3f != 0 { + assert!(self.stream.next_in as usize > buf.as_ptr() as usize); + unsafe { *self.stream.next_in.sub(1) } + } else { + 0 + }; + ZranCompInfo { + in_pos: self.total_in, + out_pos: self.total_out, + flags: self.stream.data_type as u32, + previous_byte, + pending_bits: self.stream.data_type as u8 & 0x3f, + stream_switched, + } + } + + fn get_compression_dict(&mut self, buf: &mut [u8]) -> Result { + let mut len: uInt = 0; + assert_eq!(buf.len(), ZRAN_DICT_WIN_SIZE); + 
let ret = unsafe { + inflateGetDictionary( + self.stream.deref_mut() as *mut z_stream, + buf.as_mut_ptr(), + &mut len as *mut uInt, + ) + }; + if ret != Z_OK { + Err(einval!("failed to get inflate dictionary")) + } else { + Ok(len as usize) + } + } + + fn set_dict(&mut self, dict: &[u8]) -> Result<()> { + let ret = unsafe { + inflateSetDictionary(self.stream.deref_mut(), dict.as_ptr(), dict.len() as uInt) + }; + if ret != Z_OK { + return Err(einval!("failed to reset zlib inflate context")); + } + Ok(()) + } + + fn set_prime(&mut self, bits: u8, prime: u8) -> Result<()> { + let ret = unsafe { + inflatePrime( + self.stream.deref_mut(), + bits as c_int, + prime as c_int >> (8 - bits), + ) + }; + if ret != Z_OK { + return Err(einval!("failed to reset zlib inflate context")); + } + Ok(()) + } + + fn set_next_in(&mut self, buf: &[u8]) { + self.stream.next_in = buf.as_ptr() as *mut u8; + } + + fn avail_in(&self) -> u32 { + self.stream.avail_in + } + + fn set_avail_in(&mut self, avail_in: u32) { + self.stream.avail_in = avail_in; + } + + fn next_out(&self) -> *mut u8 { + self.stream.next_out + } + + fn set_next_out(&mut self, buf: &mut [u8]) { + self.stream.next_out = buf.as_mut_ptr(); + } + + fn set_avail_out(&mut self, avail_out: u32) { + self.stream.avail_out = avail_out; + } +} + +impl Drop for ZranStream { + fn drop(&mut self) { + unsafe { inflateEnd(self.stream.deref_mut() as *mut z_stream) }; + } +} + +// Code from https://github.com/rust-lang/flate2-rs/blob/main/src/ffi/c.rs with modification. +fn align_up(size: usize, align: usize) -> usize { + (size + align - 1) & !(align - 1) +} + +#[allow(unused)] +extern "C" fn zalloc(_ptr: *mut c_void, items: uInt, item_size: uInt) -> *mut c_void { + // We need to multiply `items` and `item_size` to get the actual desired + // allocation size. Since `zfree` doesn't receive a size argument we + // also need to allocate space for a `usize` as a header so we can store + // how large the allocation is to deallocate later. 
+ let size = match items + .checked_mul(item_size) + .and_then(|i| usize::try_from(i).ok()) + .map(|size| align_up(size, ZLIB_ALIGN)) + .and_then(|i| i.checked_add(std::mem::size_of::())) + { + Some(i) => i, + None => return ptr::null_mut(), + }; + + // Make sure the `size` isn't too big to fail `Layout`'s restrictions + let layout = match Layout::from_size_align(size, ZLIB_ALIGN) { + Ok(layout) => layout, + Err(_) => return ptr::null_mut(), + }; + + unsafe { + // Allocate the data, and if successful store the size we allocated + // at the beginning and then return an offset pointer. + let ptr = alloc::alloc(layout) as *mut usize; + if ptr.is_null() { + return ptr as *mut c_void; + } + *ptr = size; + ptr.add(1) as *mut c_void + } +} + +#[allow(unused)] +extern "C" fn zfree(_ptr: *mut c_void, address: *mut c_void) { + unsafe { + // Move our address being freed back one pointer, read the size we + // stored in `zalloc`, and then free it using the standard Rust + // allocator. + let ptr = (address as *mut usize).offset(-1); + let size = *ptr; + let layout = Layout::from_size_align_unchecked(size, ZLIB_ALIGN); + alloc::dealloc(ptr as *mut u8, layout) + } +} + +extern "system" { + pub fn inflateGetDictionary( + strm: *mut z_stream, + dictionary: *mut u8, + dictLength: *mut uInt, + ) -> c_int; +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs::OpenOptions; + use std::io::{Seek, SeekFrom}; + use std::path::PathBuf; + use tar::{Archive, EntryType}; + + #[test] + fn test_parse_single_gzip_object() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = PathBuf::from(root_dir).join("../tests/texture/zran/zran-single-stream.tar.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + + let mut files = 0; + let mut objects = 0; + let reader = ZranReader::new(file).unwrap(); + let mut tar = Archive::new(reader); + let entries = tar.entries().unwrap(); + for entry in entries { + let entry = entry.unwrap(); + 
objects += 1; + if entry.header().entry_type() == EntryType::Regular { + files += 1; + } + } + + assert_eq!(objects, 7); + assert_eq!(files, 3); + } + + #[test] + fn test_parse_first_gzip_object() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = PathBuf::from(root_dir).join("../tests/texture/zran/zran-two-streams.tar.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + + let mut files = 0; + let mut objects = 0; + let reader = ZranReader::new(file).unwrap(); + let mut tar = Archive::new(reader); + + let entries = tar.entries().unwrap(); + for entry in entries { + let entry = entry.unwrap(); + objects += 1; + if entry.header().entry_type() == EntryType::Regular { + files += 1; + } + } + + assert_eq!(objects, 7); + assert_eq!(files, 3); + } + + #[test] + fn test_parse_two_gzip_objects() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = PathBuf::from(root_dir).join("../tests/texture/zran/zran-two-streams.tar.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + + let mut files = 0; + let mut objects = 0; + let reader = ZranReader::new(file).unwrap(); + let mut tar = Archive::new(reader); + tar.set_ignore_zeros(true); + + let entries = tar.entries().unwrap(); + for entry in entries { + let entry = entry.unwrap(); + objects += 1; + if entry.header().entry_type() == EntryType::Regular { + files += 1; + } + } + + assert_eq!(objects, 10); + assert_eq!(files, 5); + } + + #[test] + fn test_parse_gzip_with_big_zero() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = PathBuf::from(root_dir).join("../tests/texture/zran/zran-zero-file.tar.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + let reader = ZranReader::new(file).unwrap(); + let mut tar = Archive::new(reader.clone()); + let entries = tar.entries().unwrap(); + + let mut last: Option = None; + for entry in 
entries { + let mut entry = entry.unwrap(); + assert_eq!(entry.header().entry_type(), EntryType::Regular); + loop { + let mut buf = vec![0u8; 512]; + let sz = entry.read(&mut buf).unwrap(); + if sz == 0 { + break; + } + + let info = reader.get_current_ctx_info(); + if let Some(prev) = last { + assert_ne!(prev, info); + } + last = Some(info); + } + } + } + + #[test] + fn test_generate_comp_info() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = PathBuf::from(root_dir).join("../tests/texture/zran/zran-two-streams.tar.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + + let reader = ZranReader::new(file).unwrap(); + let mut tar = Archive::new(reader.clone()); + tar.set_ignore_zeros(true); + let mut generator = ZranGenerator::new(reader.clone()); + generator.set_min_compressed_size(1024); + generator.set_max_compressed_size(2048); + generator.set_max_uncompressed_size(4096); + + let entries = tar.entries().unwrap(); + for entry in entries { + let mut entry = entry.unwrap(); + if entry.header().entry_type() == EntryType::Regular { + loop { + let _start = generator.begin_read(512).unwrap(); + let mut buf = vec![0u8; 512]; + let sz = entry.read(&mut buf).unwrap(); + let _info = generator.end_read().unwrap(); + if sz == 0 { + break; + } + } + } + } + + let ctx = generator.get_compression_ctx_array(); + assert_eq!(ctx.len(), 3); + } + + #[test] + fn test_zran_decoder() { + let root_dir = &std::env::var("CARGO_MANIFEST_DIR").expect("$CARGO_MANIFEST_DIR"); + let path = PathBuf::from(root_dir).join("../tests/texture/zran/zran-two-streams.tar.gz"); + let file = OpenOptions::new().read(true).open(&path).unwrap(); + + let reader = ZranReader::new(file).unwrap(); + let mut tar = Archive::new(reader.clone()); + tar.set_ignore_zeros(true); + let mut generator = ZranGenerator::new(reader.clone()); + generator.set_min_compressed_size(1024); + generator.set_max_compressed_size(2048); + 
generator.set_max_uncompressed_size(4096); + + let entries = tar.entries().unwrap(); + for entry in entries { + let mut entry = entry.unwrap(); + if entry.header().entry_type() == EntryType::Regular { + loop { + let _start = generator.begin_read(512).unwrap(); + let mut buf = vec![0u8; 512]; + let sz = entry.read(&mut buf).unwrap(); + let _info = generator.end_read().unwrap(); + if sz == 0 { + break; + } + } + } + } + + let ctx_array = generator.get_compression_ctx_array(); + assert_eq!(ctx_array.len(), 3); + for i in 0..3 { + let ctx = &ctx_array[i]; + let mut c_buf = vec![0u8; ctx.in_len as usize]; + let mut file = OpenOptions::new().read(true).open(&path).unwrap(); + file.seek(SeekFrom::Start(ctx.in_offset)).unwrap(); + file.read_exact(&mut c_buf).unwrap(); + + let mut d_buf = vec![0u8; ctx.out_len as usize]; + let mut decoder = ZranDecoder::new().unwrap(); + decoder.uncompress(ctx, None, &c_buf, &mut d_buf).unwrap(); + } + } +}