-
Notifications
You must be signed in to change notification settings - Fork 41
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #93 from cosmicexplorer/bulk-parsing
perf: parse headers in blocks and scan for magic numbers with memchr
- Loading branch information
Showing 10 changed files with 1,385 additions and 521 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,38 +1,126 @@ | ||
use bencher::{benchmark_group, benchmark_main}; | ||
|
||
use std::io::{Cursor, Write}; | ||
use std::fs; | ||
use std::io::{self, prelude::*, Cursor}; | ||
|
||
use bencher::Bencher; | ||
use getrandom::getrandom; | ||
use tempdir::TempDir; | ||
use zip::write::SimpleFileOptions; | ||
use zip::{CompressionMethod, ZipArchive, ZipWriter}; | ||
use zip::{result::ZipResult, CompressionMethod, ZipArchive, ZipWriter}; | ||
|
||
const FILE_COUNT: usize = 15_000; | ||
const FILE_SIZE: usize = 1024; | ||
|
||
fn generate_random_archive(count_files: usize, file_size: usize) -> Vec<u8> { | ||
fn generate_random_archive(count_files: usize, file_size: usize) -> ZipResult<Vec<u8>> { | ||
let data = Vec::new(); | ||
let mut writer = ZipWriter::new(Cursor::new(data)); | ||
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored); | ||
|
||
let bytes = vec![0u8; file_size]; | ||
let mut bytes = vec![0u8; file_size]; | ||
|
||
for i in 0..count_files { | ||
let name = format!("file_deadbeefdeadbeefdeadbeefdeadbeefdeadbeefdeadbeef_{i}.dat"); | ||
writer.start_file(name, options).unwrap(); | ||
writer.write_all(&bytes).unwrap(); | ||
writer.start_file(name, options)?; | ||
getrandom(&mut bytes).map_err(io::Error::from)?; | ||
writer.write_all(&bytes)?; | ||
} | ||
|
||
writer.finish().unwrap().into_inner() | ||
Ok(writer.finish()?.into_inner()) | ||
} | ||
|
||
fn read_metadata(bench: &mut Bencher) { | ||
let bytes = generate_random_archive(FILE_COUNT, FILE_SIZE); | ||
let bytes = generate_random_archive(FILE_COUNT, FILE_SIZE).unwrap(); | ||
|
||
bench.iter(|| { | ||
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap(); | ||
archive.len() | ||
}); | ||
bench.bytes = bytes.len() as u64; | ||
} | ||
|
||
benchmark_group!(benches, read_metadata); | ||
const COMMENT_SIZE: usize = 50_000; | ||
|
||
fn generate_zip32_archive_with_random_comment(comment_length: usize) -> ZipResult<Vec<u8>> { | ||
let data = Vec::new(); | ||
let mut writer = ZipWriter::new(Cursor::new(data)); | ||
let options = SimpleFileOptions::default().compression_method(CompressionMethod::Stored); | ||
|
||
let mut bytes = vec![0u8; comment_length]; | ||
getrandom(&mut bytes).unwrap(); | ||
writer.set_raw_comment(bytes.into_boxed_slice()); | ||
|
||
writer.start_file("asdf.txt", options)?; | ||
writer.write_all(b"asdf")?; | ||
|
||
Ok(writer.finish()?.into_inner()) | ||
} | ||
|
||
fn parse_archive_with_comment(bench: &mut Bencher) { | ||
let bytes = generate_zip32_archive_with_random_comment(COMMENT_SIZE).unwrap(); | ||
|
||
bench.bench_n(1, |_| { | ||
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap(); | ||
let _ = archive.comment().len(); | ||
}); | ||
bench.bytes = bytes.len() as u64; | ||
} | ||
|
||
const COMMENT_SIZE_64: usize = 500_000; | ||
|
||
fn generate_zip64_archive_with_random_comment(comment_length: usize) -> ZipResult<Vec<u8>> { | ||
let data = Vec::new(); | ||
let mut writer = ZipWriter::new(Cursor::new(data)); | ||
let options = SimpleFileOptions::default() | ||
.compression_method(CompressionMethod::Stored) | ||
.large_file(true); | ||
|
||
let mut bytes = vec![0u8; comment_length]; | ||
getrandom(&mut bytes).unwrap(); | ||
writer.set_raw_comment(bytes.into_boxed_slice()); | ||
|
||
writer.start_file("asdf.txt", options)?; | ||
writer.write_all(b"asdf")?; | ||
|
||
Ok(writer.finish()?.into_inner()) | ||
} | ||
|
||
fn parse_zip64_archive_with_comment(bench: &mut Bencher) { | ||
let bytes = generate_zip64_archive_with_random_comment(COMMENT_SIZE_64).unwrap(); | ||
|
||
bench.iter(|| { | ||
let archive = ZipArchive::new(Cursor::new(bytes.as_slice())).unwrap(); | ||
archive.comment().len() | ||
}); | ||
bench.bytes = bytes.len() as u64; | ||
} | ||
|
||
fn parse_stream_archive(bench: &mut Bencher) { | ||
const STREAM_ZIP_ENTRIES: usize = 5; | ||
const STREAM_FILE_SIZE: usize = 5; | ||
|
||
let bytes = generate_random_archive(STREAM_ZIP_ENTRIES, STREAM_FILE_SIZE).unwrap(); | ||
|
||
/* Write to a temporary file path to incur some filesystem overhead from repeated reads */ | ||
let dir = TempDir::new("stream-bench").unwrap(); | ||
let out = dir.path().join("bench-out.zip"); | ||
fs::write(&out, &bytes).unwrap(); | ||
|
||
bench.iter(|| { | ||
let mut f = fs::File::open(&out).unwrap(); | ||
while zip::read::read_zipfile_from_stream(&mut f) | ||
.unwrap() | ||
.is_some() | ||
{} | ||
}); | ||
bench.bytes = bytes.len() as u64; | ||
} | ||
|
||
benchmark_group!( | ||
benches, | ||
read_metadata, | ||
parse_archive_with_comment, | ||
parse_zip64_archive_with_comment, | ||
parse_stream_archive, | ||
); | ||
benchmark_main!(benches); |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.