Experimental support for LZMA / XZ codec (#127)

* Add experimental LZMA / XZ support * Reduce max size for test_variants_different_dtypes Closes #123 * Switch to xz2 crate
milesgranger · Jan 22, 2024 · 2d710c7 · 2d710c7
1 parent 1876cbd
commit 2d710c7
Show file tree

Hide file tree

Showing 12 changed files with 253 additions and 6 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/cramjam-python/Cargo.toml b/cramjam-python/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "cramjam-python"
-version = "2.8.0"
+version = "2.8.1-dev1"
 authors = ["Miles Granger <miles59923@gmail.com>"]
 edition = "2021"
 license = "MIT"

diff --git a/cramjam-python/Makefile b/cramjam-python/Makefile
@@ -36,6 +36,9 @@ bench-bzip2:
 bench-zstd:
 	$(BASE_BENCH_CMD) zstd
 
+bench-lzma:
+	$(BASE_BENCH_CMD) lzma
+
 dev-install:
 	rm -rf ./dist
 	maturin build --release --out dist --interpreter $(shell which python)

diff --git a/cramjam-python/README.md b/cramjam-python/README.md
@@ -42,6 +42,7 @@ Available algorithms:
 - [X] Gzip
 - [X] Deflate
 - [X] ZSTD
+- [X] LZMA / XZ (cramjam.experimental.lzma)  # experimental support!
 
 All available for use as:
 

diff --git a/cramjam-python/benchmarks/README.md b/cramjam-python/benchmarks/README.md
@@ -401,3 +401,44 @@ test_zstd[urls.10K-cramjam]                       2,552.0180 (41.54)     6,011.9
 test_zstd[urls.10K-zstd]                          3,323.8390 (54.10)     5,461.0030 (37.50)     3,551.0236 (52.56)      229.1187 (61.53)     3,528.1250 (52.60)      122.4380 (44.83)         7;7     281.6089 (0.02)        243           1
 --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
 ```
+
+
+#### LZMA
+
+`make bench-lzma`
+
+```bash
+-------------------------------------------------------------------------------------------------------- benchmark: 28 tests --------------------------------------------------------------------------------------------------------
+Name (time in ms)                                        Min                    Max                   Mean              StdDev                 Median                 IQR            Outliers       OPS            Rounds  Iterations
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+test_lzma[Mark.Twain-Tom.Sawyer.txt-cramjam]          2.3947 (1.02)          5.7831 (1.23)          2.5805 (1.0)        0.3105 (2.09)          2.5408 (1.0)        0.0762 (1.0)           2;7  387.5274 (1.0)         145           1
+test_lzma[Mark.Twain-Tom.Sawyer.txt-lzma]             2.3582 (1.0)           4.6904 (1.0)           2.5993 (1.01)       0.1785 (1.20)          2.5776 (1.01)       0.1257 (1.65)        32;19  384.7239 (0.99)        327           1
+test_lzma[alice29.txt-cramjam]                       28.0502 (11.89)        31.9519 (6.81)         29.8548 (11.57)      0.8266 (5.57)         29.8514 (11.75)      0.8356 (10.97)         8;3   33.4954 (0.09)         35           1
+test_lzma[alice29.txt-lzma]                          29.5697 (12.54)        30.4112 (6.48)         30.0151 (11.63)      0.1881 (1.27)         30.0110 (11.81)      0.2454 (3.22)          7;0   33.3165 (0.09)         33           1
+test_lzma[asyoulik.txt-cramjam]                      23.5887 (10.00)        25.8111 (5.50)         23.9925 (9.30)       0.3971 (2.68)         23.8906 (9.40)       0.1810 (2.38)          4;5   41.6796 (0.11)         41           1
+test_lzma[asyoulik.txt-lzma]                         23.7592 (10.08)        42.0096 (8.96)         34.4439 (13.35)      8.8400 (59.58)        41.7199 (16.42)     17.8603 (234.43)       16;0   29.0327 (0.07)         39           1
+test_lzma[fifty-four-mb-random-cramjam]          15,819.4464 (>1000.0)  16,100.5117 (>1000.0)  15,919.2390 (>1000.0)  127.4622 (859.03)   15,837.1569 (>1000.0)  202.0210 (>1000.0)       1;0    0.0628 (0.00)          5           1
+test_lzma[fifty-four-mb-random-lzma]             16,032.8752 (>1000.0)  16,277.5432 (>1000.0)  16,151.0817 (>1000.0)   88.0539 (593.44)   16,157.7523 (>1000.0)   92.8587 (>1000.0)       2;0    0.0619 (0.00)          5           1
+test_lzma[fifty-four-mb-repeating-cramjam]          698.1282 (296.04)      709.9000 (151.35)      701.2151 (271.74)     4.9695 (33.49)       699.1874 (275.18)     4.8818 (64.08)         1;0    1.4261 (0.00)          5           1
+test_lzma[fifty-four-mb-repeating-lzma]             692.3175 (293.58)      696.7862 (148.55)      695.2843 (269.44)     1.7417 (11.74)       695.5683 (273.76)     1.6960 (22.26)         1;0    1.4383 (0.00)          5           1
+test_lzma[fireworks.jpeg-cramjam]                    11.3957 (4.83)         13.3655 (2.85)         11.9349 (4.63)       0.3479 (2.34)         11.8311 (4.66)       0.3853 (5.06)         14;2   83.7880 (0.22)         54           1
+test_lzma[fireworks.jpeg-lzma]                       12.1977 (5.17)         14.2095 (3.03)         12.9411 (5.02)       0.3885 (2.62)         12.9547 (5.10)       0.5422 (7.12)         20;1   77.2733 (0.20)         71           1
+test_lzma[geo.protodata-cramjam]                      8.7642 (3.72)         10.6508 (2.27)          9.4973 (3.68)       0.3392 (2.29)          9.5402 (3.75)       0.4896 (6.43)         31;1  105.2934 (0.27)         92           1
+test_lzma[geo.protodata-lzma]                         8.9854 (3.81)          9.6380 (2.05)          9.1964 (3.56)       0.1484 (1.0)           9.1581 (3.60)       0.1816 (2.38)         25;5  108.7380 (0.28)         96           1
+test_lzma[html-cramjam]                               9.8727 (4.19)         10.6944 (2.28)         10.1512 (3.93)       0.1586 (1.07)         10.1483 (3.99)       0.2134 (2.80)         32;2   98.5104 (0.25)         96           1
+test_lzma[html-lzma]                                  9.8842 (4.19)         10.7465 (2.29)         10.2713 (3.98)       0.2212 (1.49)         10.2653 (4.04)       0.4118 (5.40)         39;0   97.3591 (0.25)         99           1
+test_lzma[html_x_4-cramjam]                          27.2375 (11.55)        29.1826 (6.22)         28.2651 (10.95)      0.7348 (4.95)         28.8029 (11.34)      1.4011 (18.39)        13;0   35.3793 (0.09)         35           1
+test_lzma[html_x_4-lzma]                             27.0631 (11.48)        28.5466 (6.09)         27.7356 (10.75)      0.3388 (2.28)         27.7787 (10.93)      0.4396 (5.77)         10;0   36.0547 (0.09)         36           1
+test_lzma[kppkn.gtb-cramjam]                         42.0213 (17.82)        42.5622 (9.07)         42.2960 (16.39)      0.1657 (1.12)         42.2719 (16.64)      0.3047 (4.00)         10;0   23.6429 (0.06)         24           1
+test_lzma[kppkn.gtb-lzma]                            41.9663 (17.80)        43.1549 (9.20)         42.3119 (16.40)      0.2349 (1.58)         42.2610 (16.63)      0.2133 (2.80)          4;1   23.6340 (0.06)         24           1
+test_lzma[lcet10.txt-cramjam]                        84.9825 (36.04)        86.3458 (18.41)        85.5724 (33.16)      0.4918 (3.31)         85.4851 (33.64)      0.9044 (11.87)         4;0   11.6860 (0.03)         12           1
+test_lzma[lcet10.txt-lzma]                           85.4119 (36.22)        89.5356 (19.09)        88.1896 (34.18)      1.0074 (6.79)         88.2839 (34.75)      0.4725 (6.20)          3;3   11.3392 (0.03)         12           1
+test_lzma[paper-100k.pdf-cramjam]                    13.5110 (5.73)         15.4715 (3.30)         13.9447 (5.40)       0.3596 (2.42)         13.8906 (5.47)       0.3166 (4.16)         11;3   71.7116 (0.19)         58           1
+test_lzma[paper-100k.pdf-lzma]                       13.4903 (5.72)         15.6489 (3.34)         13.8804 (5.38)       0.3380 (2.28)         13.8328 (5.44)       0.3119 (4.09)          9;3   72.0441 (0.19)         69           1
+test_lzma[plrabn12.txt-cramjam]                     104.2685 (44.21)       105.8319 (22.56)       104.6718 (40.56)      0.4610 (3.11)        104.5525 (41.15)      0.3270 (4.29)          1;1    9.5537 (0.02)         10           1
+test_lzma[plrabn12.txt-lzma]                        104.2437 (44.20)       105.7301 (22.54)       105.0866 (40.72)      0.3972 (2.68)        105.0992 (41.36)      0.3382 (4.44)          2;1    9.5160 (0.02)         10           1
+test_lzma[urls.10K-cramjam]                         115.7511 (49.08)       121.7127 (25.95)       118.9406 (46.09)      2.0999 (14.15)       119.2785 (46.94)      3.7894 (49.74)         4;0    8.4076 (0.02)          9           1
+test_lzma[urls.10K-lzma]                            114.8733 (48.71)       118.0015 (25.16)       115.8224 (44.88)      0.9726 (6.55)        115.6923 (45.53)      1.0803 (14.18)         1;1    8.6339 (0.02)          9           1
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+```
+
diff --git a/cramjam-python/benchmarks/test_bench.py b/cramjam-python/benchmarks/test_bench.py
@@ -283,6 +283,30 @@ def test_bzip2(benchmark, file, use_cramjam: bool):
         )
 
 
+@pytest.mark.parametrize(
+    "use_cramjam", (True, False), ids=lambda val: "cramjam" if val else "lzma"
+)
+@pytest.mark.parametrize("file", FILES, ids=lambda val: val.name)
+def test_lzma(benchmark, file, use_cramjam: bool):
+    import lzma
+
+    data = file.read_bytes()
+    if use_cramjam:
+        benchmark(
+            round_trip,
+            compress=cramjam.experimental.lzma.compress,
+            decompress=cramjam.experimental.lzma.decompress,
+            data=data,
+        )
+    else:
+        benchmark(
+            round_trip,
+            compress=lzma.compress,
+            decompress=lzma.decompress,
+            data=data,
+        )
+
+
 @profile
 def memory_profile():
     import snappy

diff --git a/cramjam-python/src/experimental.rs b/cramjam-python/src/experimental.rs
@@ -0,0 +1,123 @@
+//! Experimental and unstable implementations.
+//! This module makes no effort to maintain SemVer between
+//! releases.
+use pyo3::prelude::*;
+use pyo3::PyResult;
+
+pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> {
+    Python::with_gil(|py| add_experimental_modules(py, m))?;
+    Ok(())
+}
+fn add_experimental_modules(py: Python, m: &PyModule) -> PyResult<()> {
+    let lzma_module = PyModule::new(py, "lzma")?;
+    lzma::init_py_module(lzma_module)?;
+    m.add_submodule(lzma_module)?;
+    Ok(())
+}
+
+pub mod lzma {
+    //! lzma de/compression interface
+    use crate::exceptions::{CompressionError, DecompressionError};
+    use crate::io::{AsBytes, RustyBuffer};
+    use crate::BytesType;
+    use pyo3::exceptions::PyNotImplementedError;
+    use pyo3::prelude::*;
+    use pyo3::wrap_pyfunction;
+    use pyo3::PyResult;
+    use std::io::Cursor;
+
+    pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> {
+        m.add_function(wrap_pyfunction!(compress, m)?)?;
+        m.add_function(wrap_pyfunction!(decompress, m)?)?;
+        m.add_function(wrap_pyfunction!(compress_into, m)?)?;
+        m.add_function(wrap_pyfunction!(decompress_into, m)?)?;
+        m.add_class::<Compressor>()?;
+        m.add_class::<Decompressor>()?;
+        Ok(())
+    }
+    /// LZMA decompression.
+    ///
+    /// Python Example
+    /// --------------
+    /// ```python
+    /// >>> # bytes or bytearray; bytearray is faster
+    /// >>> cramjam.experimental.lzma.decompress(compressed_bytes, output_len=Optional[int])
+    /// ```
+    #[pyfunction]
+    pub fn decompress(py: Python, data: BytesType, output_len: Option<usize>) -> PyResult<RustyBuffer> {
+        crate::generic!(py, libcramjam::lzma::decompress[data], output_len = output_len)
+            .map_err(DecompressionError::from_err)
+    }
+
+    /// LZMA compression.
+    ///
+    /// Python Example
+    /// --------------
+    /// ```python
+    /// >>> _ = cramjam.experimental.lzma.compress(b'some bytes here')
+    /// ```
+    #[pyfunction]
+    pub fn compress(
+        py: Python,
+        data: BytesType,
+        preset: Option<u32>,
+        output_len: Option<usize>,
+    ) -> PyResult<RustyBuffer> {
+        crate::generic!(
+            py,
+            libcramjam::lzma::compress[data],
+            output_len = output_len,
+            level = preset
+        )
+        .map_err(CompressionError::from_err)
+    }
+
+    /// Compress directly into an output buffer
+    #[pyfunction]
+    pub fn compress_into(py: Python, input: BytesType, mut output: BytesType, preset: Option<u32>) -> PyResult<usize> {
+        crate::generic!(py, libcramjam::lzma::compress[input, output], level = preset)
+            .map_err(CompressionError::from_err)
+    }
+
+    /// Decompress directly into an output buffer
+    #[pyfunction]
+    pub fn decompress_into(py: Python, input: BytesType, mut output: BytesType) -> PyResult<usize> {
+        crate::generic!(py, libcramjam::lzma::decompress[input, output]).map_err(DecompressionError::from_err)
+    }
+    /// LZMA / XZ Compressor object for streaming compression
+    #[pyclass]
+    pub struct Compressor {
+        inner: Option<libcramjam::lzma::xz2::write::XzEncoder<Cursor<Vec<u8>>>>,
+    }
+
+    #[pymethods]
+    impl Compressor {
+        /// Initialize a new `Compressor` instance.
+        #[new]
+        pub fn __init__(preset: Option<u32>) -> PyResult<Self> {
+            let preset = preset.unwrap_or(5);
+            let inner = libcramjam::lzma::xz2::write::XzEncoder::new(Cursor::new(vec![]), preset);
+            Ok(Self { inner: Some(inner) })
+        }
+
+        /// Compress input into the current compressor's stream.
+        pub fn compress(&mut self, input: &[u8]) -> PyResult<usize> {
+            crate::io::stream_compress(&mut self.inner, input)
+        }
+
+        /// Not implemented for LZMA: always raises `NotImplementedError`; use `.finish()` instead.
+        pub fn flush(&mut self) -> PyResult<RustyBuffer> {
+            Err(PyNotImplementedError::new_err(
+                "`.flush` for LZMA not implemented, just use `.finish()` instead when your done.",
+            ))
+        }
+
+        /// Consume the current compressor state and return the compressed stream
+        /// **NB** The compressor will not be usable after this method is called.
+        pub fn finish(&mut self) -> PyResult<RustyBuffer> {
+            crate::io::stream_finish(&mut self.inner, |inner| inner.finish().map(|c| c.into_inner()))
+        }
+    }
+
+    crate::make_decompressor!(lzma);
+}
diff --git a/cramjam-python/src/lib.rs b/cramjam-python/src/lib.rs
@@ -55,6 +55,7 @@ pub mod brotli;
 pub mod bzip2;
 pub mod deflate;
 pub mod exceptions;
+pub mod experimental;
 pub mod gzip;
 pub mod io;
 pub mod lz4;
@@ -369,6 +370,7 @@ fn cramjam(py: Python, m: &PyModule) -> PyResult<()> {
     make_submodule!(py -> m -> gzip);
     make_submodule!(py -> m -> deflate);
     make_submodule!(py -> m -> zstd);
+    make_submodule!(py -> m -> experimental);
 
     Ok(())
 }
diff --git a/cramjam-python/tests/test_variants.py b/cramjam-python/tests/test_variants.py
@@ -8,7 +8,14 @@
 from hypothesis import strategies as st, given, settings
 from hypothesis.extra import numpy as st_np
 
-VARIANTS = ("snappy", "brotli", "bzip2", "lz4", "gzip", "deflate", "zstd")
+VARIANTS = ("snappy", "brotli", "bzip2", "lz4", "gzip", "deflate", "zstd", "lzma")
+
+# LZMA is experimental, but in testing we'll treat it like it's not in the
+# experimental submodule.
+# TODO: Maybe rename it to XZ, since LZMA is the legacy version.
+# ref: https://github.com/fpgaminer/rust-lzma/issues/18, but then
+# the rustlib and the clib both are lzma... so maybe not?
+cramjam.lzma = cramjam.experimental.lzma
 
 # Some OS can be slow or have higher variability in their runtimes on CI
 settings.register_profile(
@@ -32,7 +39,7 @@ def test_has_version():
 
 
 @pytest.mark.parametrize("variant_str", VARIANTS)
-@given(arr=st_np.arrays(st_np.scalar_dtypes(), shape=st.integers(0, int(1e5))))
+@given(arr=st_np.arrays(st_np.scalar_dtypes(), shape=st.integers(0, int(1e4))))
 def test_variants_different_dtypes(variant_str, arr):
     variant = getattr(cramjam, variant_str)
     compressed = variant.compress(arr)

diff --git a/libcramjam/Cargo.toml b/libcramjam/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "libcramjam"
-version = "0.1.5"
+version = "0.2.0"
 edition = "2021"
 license = "MIT"
 description = "Compression library combining a plethora of algorithms in a similar as possible API"
@@ -21,6 +21,7 @@ libdeflater = "^1"
 zstd = "0.11.1+zstd.1.5.2"
 zstd-safe = "7.0.0"  # NOTE: This is the same dep version as zstd, as they don't re-export
 libc = { version = "0.2", optional = true }
+xz2 = { version = "0.1.7", features = ["static"] }
 
 [build-dependencies]
 cbindgen = "^0.24"

diff --git a/libcramjam/src/lib.rs b/libcramjam/src/lib.rs
@@ -3,6 +3,7 @@ pub mod bzip2;
 pub mod deflate;
 pub mod gzip;
 pub mod lz4;
+pub mod lzma;
 pub mod snappy;
 pub mod zstd;
 
@@ -80,4 +81,5 @@ mod tests {
     test_variant!(deflate, compressed_len = 157174, level = None);
     test_variant!(zstd, compressed_len = 4990, level = None);
     test_variant!(lz4, compressed_len = 303278, level = None);
+    test_variant!(lzma, compressed_len = 8020, level = None);
 }
diff --git a/libcramjam/src/lzma.rs b/libcramjam/src/lzma.rs
@@ -0,0 +1,22 @@
+//! lzma / xz de/compression interface
+use std::io;
+use std::io::{Read, Result, Write};
+pub use xz2;
+use xz2::read::{XzDecoder, XzEncoder};
+
+/// Decompress XZ data read from `input` into `output`, returning the number of decompressed bytes written
+#[inline(always)]
+pub fn decompress<W: Write + ?Sized, R: Read>(input: R, output: &mut W) -> Result<usize> {
+    let mut decoder = XzDecoder::new(input);
+    let n_bytes = io::copy(&mut decoder, output)?;
+    Ok(n_bytes as usize)
+}
+
+/// Compress `data` into an XZ stream written to `output` (`preset` defaults to 6), returning the number of compressed bytes written
+#[inline(always)]
+pub fn compress<W: Write + ?Sized, R: Read>(data: R, output: &mut W, preset: Option<u32>) -> Result<usize> {
+    let preset = preset.unwrap_or(6); // same as python default
+    let mut encoder = XzEncoder::new(data, preset);
+    let n_bytes = io::copy(&mut encoder, output)?;
+    Ok(n_bytes as usize)
+}