diff --git a/Cargo.lock b/Cargo.lock index f0dab373..d06aea01 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -300,7 +300,7 @@ dependencies = [ [[package]] name = "cramjam-python" -version = "2.7.0" +version = "2.8.1-dev1" dependencies = [ "libcramjam", "pyo3", @@ -511,7 +511,7 @@ checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" [[package]] name = "libcramjam" -version = "0.1.5" +version = "0.2.0" dependencies = [ "brotli", "bzip2", @@ -522,6 +522,7 @@ dependencies = [ "libdeflater", "lz4", "snap", + "xz2", "zstd", "zstd-safe 7.0.0", ] @@ -592,6 +593,17 @@ dependencies = [ "libc", ] +[[package]] +name = "lzma-sys" +version = "0.1.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "memchr" version = "2.6.4" @@ -1253,6 +1265,15 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a515f5799fe4961cb532f983ce2b23082366b898e52ffbce459c86f67c8378a" +[[package]] +name = "xz2" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" +dependencies = [ + "lzma-sys", +] + [[package]] name = "zstd" version = "0.11.2+zstd.1.5.2" diff --git a/cramjam-python/Cargo.toml b/cramjam-python/Cargo.toml index 53185fce..1c3c82cd 100644 --- a/cramjam-python/Cargo.toml +++ b/cramjam-python/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "cramjam-python" -version = "2.8.0" +version = "2.8.1-dev1" authors = ["Miles Granger "] edition = "2021" license = "MIT" diff --git a/cramjam-python/Makefile b/cramjam-python/Makefile index 12a9078e..a56ed141 100644 --- a/cramjam-python/Makefile +++ b/cramjam-python/Makefile @@ -36,6 +36,9 @@ bench-bzip2: bench-zstd: $(BASE_BENCH_CMD) zstd +bench-lzma: + $(BASE_BENCH_CMD) lzma + dev-install: rm -rf ./dist maturin 
build --release --out dist --interpreter $(shell which python) diff --git a/cramjam-python/README.md b/cramjam-python/README.md index 05d03d94..9ed9d9bf 100644 --- a/cramjam-python/README.md +++ b/cramjam-python/README.md @@ -42,6 +42,7 @@ Available algorithms: - [X] Gzip - [X] Deflate - [X] ZSTD +- [X] LZMA / XZ (cramjam.experimental.lzma) # experimental support! All available for use as: diff --git a/cramjam-python/benchmarks/README.md b/cramjam-python/benchmarks/README.md index 83b20d52..fd3d70df 100644 --- a/cramjam-python/benchmarks/README.md +++ b/cramjam-python/benchmarks/README.md @@ -401,3 +401,44 @@ test_zstd[urls.10K-cramjam] 2,552.0180 (41.54) 6,011.9 test_zstd[urls.10K-zstd] 3,323.8390 (54.10) 5,461.0030 (37.50) 3,551.0236 (52.56) 229.1187 (61.53) 3,528.1250 (52.60) 122.4380 (44.83) 7;7 281.6089 (0.02) 243 1 -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- ``` + + +#### LZMA + +`make bench-lzma` + +```bash +-------------------------------------------------------------------------------------------------------- benchmark: 28 tests -------------------------------------------------------------------------------------------------------- +Name (time in ms) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations +------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +test_lzma[Mark.Twain-Tom.Sawyer.txt-cramjam] 2.3947 (1.02) 5.7831 (1.23) 2.5805 (1.0) 0.3105 (2.09) 2.5408 (1.0) 0.0762 (1.0) 2;7 387.5274 (1.0) 145 1 +test_lzma[Mark.Twain-Tom.Sawyer.txt-lzma] 2.3582 (1.0) 4.6904 (1.0) 2.5993 (1.01) 0.1785 (1.20) 2.5776 (1.01) 0.1257 (1.65) 32;19 384.7239 (0.99) 327 1 
+test_lzma[alice29.txt-cramjam] 28.0502 (11.89) 31.9519 (6.81) 29.8548 (11.57) 0.8266 (5.57) 29.8514 (11.75) 0.8356 (10.97) 8;3 33.4954 (0.09) 35 1 +test_lzma[alice29.txt-lzma] 29.5697 (12.54) 30.4112 (6.48) 30.0151 (11.63) 0.1881 (1.27) 30.0110 (11.81) 0.2454 (3.22) 7;0 33.3165 (0.09) 33 1 +test_lzma[asyoulik.txt-cramjam] 23.5887 (10.00) 25.8111 (5.50) 23.9925 (9.30) 0.3971 (2.68) 23.8906 (9.40) 0.1810 (2.38) 4;5 41.6796 (0.11) 41 1 +test_lzma[asyoulik.txt-lzma] 23.7592 (10.08) 42.0096 (8.96) 34.4439 (13.35) 8.8400 (59.58) 41.7199 (16.42) 17.8603 (234.43) 16;0 29.0327 (0.07) 39 1 +test_lzma[fifty-four-mb-random-cramjam] 15,819.4464 (>1000.0) 16,100.5117 (>1000.0) 15,919.2390 (>1000.0) 127.4622 (859.03) 15,837.1569 (>1000.0) 202.0210 (>1000.0) 1;0 0.0628 (0.00) 5 1 +test_lzma[fifty-four-mb-random-lzma] 16,032.8752 (>1000.0) 16,277.5432 (>1000.0) 16,151.0817 (>1000.0) 88.0539 (593.44) 16,157.7523 (>1000.0) 92.8587 (>1000.0) 2;0 0.0619 (0.00) 5 1 +test_lzma[fifty-four-mb-repeating-cramjam] 698.1282 (296.04) 709.9000 (151.35) 701.2151 (271.74) 4.9695 (33.49) 699.1874 (275.18) 4.8818 (64.08) 1;0 1.4261 (0.00) 5 1 +test_lzma[fifty-four-mb-repeating-lzma] 692.3175 (293.58) 696.7862 (148.55) 695.2843 (269.44) 1.7417 (11.74) 695.5683 (273.76) 1.6960 (22.26) 1;0 1.4383 (0.00) 5 1 +test_lzma[fireworks.jpeg-cramjam] 11.3957 (4.83) 13.3655 (2.85) 11.9349 (4.63) 0.3479 (2.34) 11.8311 (4.66) 0.3853 (5.06) 14;2 83.7880 (0.22) 54 1 +test_lzma[fireworks.jpeg-lzma] 12.1977 (5.17) 14.2095 (3.03) 12.9411 (5.02) 0.3885 (2.62) 12.9547 (5.10) 0.5422 (7.12) 20;1 77.2733 (0.20) 71 1 +test_lzma[geo.protodata-cramjam] 8.7642 (3.72) 10.6508 (2.27) 9.4973 (3.68) 0.3392 (2.29) 9.5402 (3.75) 0.4896 (6.43) 31;1 105.2934 (0.27) 92 1 +test_lzma[geo.protodata-lzma] 8.9854 (3.81) 9.6380 (2.05) 9.1964 (3.56) 0.1484 (1.0) 9.1581 (3.60) 0.1816 (2.38) 25;5 108.7380 (0.28) 96 1 +test_lzma[html-cramjam] 9.8727 (4.19) 10.6944 (2.28) 10.1512 (3.93) 0.1586 (1.07) 10.1483 (3.99) 0.2134 (2.80) 32;2 98.5104 
(0.25) 96 1 +test_lzma[html-lzma] 9.8842 (4.19) 10.7465 (2.29) 10.2713 (3.98) 0.2212 (1.49) 10.2653 (4.04) 0.4118 (5.40) 39;0 97.3591 (0.25) 99 1 +test_lzma[html_x_4-cramjam] 27.2375 (11.55) 29.1826 (6.22) 28.2651 (10.95) 0.7348 (4.95) 28.8029 (11.34) 1.4011 (18.39) 13;0 35.3793 (0.09) 35 1 +test_lzma[html_x_4-lzma] 27.0631 (11.48) 28.5466 (6.09) 27.7356 (10.75) 0.3388 (2.28) 27.7787 (10.93) 0.4396 (5.77) 10;0 36.0547 (0.09) 36 1 +test_lzma[kppkn.gtb-cramjam] 42.0213 (17.82) 42.5622 (9.07) 42.2960 (16.39) 0.1657 (1.12) 42.2719 (16.64) 0.3047 (4.00) 10;0 23.6429 (0.06) 24 1 +test_lzma[kppkn.gtb-lzma] 41.9663 (17.80) 43.1549 (9.20) 42.3119 (16.40) 0.2349 (1.58) 42.2610 (16.63) 0.2133 (2.80) 4;1 23.6340 (0.06) 24 1 +test_lzma[lcet10.txt-cramjam] 84.9825 (36.04) 86.3458 (18.41) 85.5724 (33.16) 0.4918 (3.31) 85.4851 (33.64) 0.9044 (11.87) 4;0 11.6860 (0.03) 12 1 +test_lzma[lcet10.txt-lzma] 85.4119 (36.22) 89.5356 (19.09) 88.1896 (34.18) 1.0074 (6.79) 88.2839 (34.75) 0.4725 (6.20) 3;3 11.3392 (0.03) 12 1 +test_lzma[paper-100k.pdf-cramjam] 13.5110 (5.73) 15.4715 (3.30) 13.9447 (5.40) 0.3596 (2.42) 13.8906 (5.47) 0.3166 (4.16) 11;3 71.7116 (0.19) 58 1 +test_lzma[paper-100k.pdf-lzma] 13.4903 (5.72) 15.6489 (3.34) 13.8804 (5.38) 0.3380 (2.28) 13.8328 (5.44) 0.3119 (4.09) 9;3 72.0441 (0.19) 69 1 +test_lzma[plrabn12.txt-cramjam] 104.2685 (44.21) 105.8319 (22.56) 104.6718 (40.56) 0.4610 (3.11) 104.5525 (41.15) 0.3270 (4.29) 1;1 9.5537 (0.02) 10 1 +test_lzma[plrabn12.txt-lzma] 104.2437 (44.20) 105.7301 (22.54) 105.0866 (40.72) 0.3972 (2.68) 105.0992 (41.36) 0.3382 (4.44) 2;1 9.5160 (0.02) 10 1 +test_lzma[urls.10K-cramjam] 115.7511 (49.08) 121.7127 (25.95) 118.9406 (46.09) 2.0999 (14.15) 119.2785 (46.94) 3.7894 (49.74) 4;0 8.4076 (0.02) 9 1 +test_lzma[urls.10K-lzma] 114.8733 (48.71) 118.0015 (25.16) 115.8224 (44.88) 0.9726 (6.55) 115.6923 (45.53) 1.0803 (14.18) 1;1 8.6339 (0.02) 9 1 
+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- +``` + diff --git a/cramjam-python/benchmarks/test_bench.py b/cramjam-python/benchmarks/test_bench.py index a4edc154..5ae19b56 100644 --- a/cramjam-python/benchmarks/test_bench.py +++ b/cramjam-python/benchmarks/test_bench.py @@ -283,6 +283,30 @@ def test_bzip2(benchmark, file, use_cramjam: bool): ) +@pytest.mark.parametrize( + "use_cramjam", (True, False), ids=lambda val: "cramjam" if val else "lzma" +) +@pytest.mark.parametrize("file", FILES, ids=lambda val: val.name) +def test_lzma(benchmark, file, use_cramjam: bool): + import lzma + + data = file.read_bytes() + if use_cramjam: + benchmark( + round_trip, + compress=cramjam.experimental.lzma.compress, + decompress=cramjam.experimental.lzma.decompress, + data=data, + ) + else: + benchmark( + round_trip, + compress=lzma.compress, + decompress=lzma.decompress, + data=data, + ) + + @profile def memory_profile(): import snappy diff --git a/cramjam-python/src/experimental.rs b/cramjam-python/src/experimental.rs new file mode 100644 index 00000000..b6b7a6f6 --- /dev/null +++ b/cramjam-python/src/experimental.rs @@ -0,0 +1,123 @@ +//! Experimental and unstable implementations. +//! This module makes no effort to maintain SemVer between +//! releases. +use pyo3::prelude::*; +use pyo3::PyResult; + +pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { + Python::with_gil(|py| add_experimental_modules(py, m))?; + Ok(()) +} +fn add_experimental_modules(py: Python, m: &PyModule) -> PyResult<()> { + let lzma_module = PyModule::new(py, "lzma")?; + lzma::init_py_module(lzma_module)?; + m.add_submodule(lzma_module)?; + Ok(()) +} + +pub mod lzma { + //! 
lzma de/compression interface + use crate::exceptions::{CompressionError, DecompressionError}; + use crate::io::{AsBytes, RustyBuffer}; + use crate::BytesType; + use pyo3::exceptions::PyNotImplementedError; + use pyo3::prelude::*; + use pyo3::wrap_pyfunction; + use pyo3::PyResult; + use std::io::Cursor; + + pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> { + m.add_function(wrap_pyfunction!(compress, m)?)?; + m.add_function(wrap_pyfunction!(decompress, m)?)?; + m.add_function(wrap_pyfunction!(compress_into, m)?)?; + m.add_function(wrap_pyfunction!(decompress_into, m)?)?; + m.add_class::()?; + m.add_class::()?; + Ok(()) + } + /// LZMA decompression. + /// + /// Python Example + /// -------------- + /// ```python + /// >>> # bytes or bytearray; bytearray is faster + /// >>> cramjam.experimental.lzma.decompress(compressed_bytes, output_len=Optional[None]) + /// ``` + #[pyfunction] + pub fn decompress(py: Python, data: BytesType, output_len: Option) -> PyResult { + crate::generic!(py, libcramjam::lzma::decompress[data], output_len = output_len) + .map_err(DecompressionError::from_err) + } + + /// LZMA compression. 
+ /// + /// Python Example + /// -------------- + /// ```python + /// >>> _ = cramjam.experimental.lzma.compress(b'some bytes here') + /// ``` + #[pyfunction] + pub fn compress( + py: Python, + data: BytesType, + preset: Option, + output_len: Option, + ) -> PyResult { + crate::generic!( + py, + libcramjam::lzma::compress[data], + output_len = output_len, + level = preset + ) + .map_err(CompressionError::from_err) + } + + /// Compress directly into an output buffer + #[pyfunction] + pub fn compress_into(py: Python, input: BytesType, mut output: BytesType, preset: Option) -> PyResult { + crate::generic!(py, libcramjam::lzma::compress[input, output], level = preset) + .map_err(CompressionError::from_err) + } + + /// Decompress directly into an output buffer + #[pyfunction] + pub fn decompress_into(py: Python, input: BytesType, mut output: BytesType) -> PyResult { + crate::generic!(py, libcramjam::lzma::decompress[input, output]).map_err(DecompressionError::from_err) + } + /// LZMA Compressor object for streaming compression + #[pyclass] + pub struct Compressor { + inner: Option>>>, + } + + #[pymethods] + impl Compressor { + /// Initialize a new `Compressor` instance. + #[new] + pub fn __init__(preset: Option) -> PyResult { + let preset = preset.unwrap_or(5); + let inner = libcramjam::lzma::xz2::write::XzEncoder::new(Cursor::new(vec![]), preset); + Ok(Self { inner: Some(inner) }) + } + + /// Compress input into the current compressor's stream. + pub fn compress(&mut self, input: &[u8]) -> PyResult { + crate::io::stream_compress(&mut self.inner, input) + } + + /// Flush and return current compressed stream + pub fn flush(&mut self) -> PyResult { + Err(PyNotImplementedError::new_err( + "`.flush` for LZMA not implemented, just use `.finish()` instead when your done.", + )) + } + + /// Consume the current compressor state and return the compressed stream + /// **NB** The compressor will not be usable after this method is called.
+ pub fn finish(&mut self) -> PyResult { + crate::io::stream_finish(&mut self.inner, |inner| inner.finish().map(|c| c.into_inner())) + } + } + + crate::make_decompressor!(lzma); +} diff --git a/cramjam-python/src/lib.rs b/cramjam-python/src/lib.rs index a7202538..c494db86 100644 --- a/cramjam-python/src/lib.rs +++ b/cramjam-python/src/lib.rs @@ -55,6 +55,7 @@ pub mod brotli; pub mod bzip2; pub mod deflate; pub mod exceptions; +pub mod experimental; pub mod gzip; pub mod io; pub mod lz4; @@ -369,6 +370,7 @@ fn cramjam(py: Python, m: &PyModule) -> PyResult<()> { make_submodule!(py -> m -> gzip); make_submodule!(py -> m -> deflate); make_submodule!(py -> m -> zstd); + make_submodule!(py -> m -> experimental); Ok(()) } diff --git a/cramjam-python/tests/test_variants.py b/cramjam-python/tests/test_variants.py index 22c5afed..a24e81c1 100644 --- a/cramjam-python/tests/test_variants.py +++ b/cramjam-python/tests/test_variants.py @@ -8,7 +8,14 @@ from hypothesis import strategies as st, given, settings from hypothesis.extra import numpy as st_np -VARIANTS = ("snappy", "brotli", "bzip2", "lz4", "gzip", "deflate", "zstd") +VARIANTS = ("snappy", "brotli", "bzip2", "lz4", "gzip", "deflate", "zstd", "lzma") + +# LZMA is experimental, but in testing we'll treat it like it's not in the +# experimental submodule. +# TODO: Maybe rename it to XZ, since LZMA is the legacy version. +# ref: https://github.com/fpgaminer/rust-lzma/issues/18, but then +# the rustlib and the clib both are lzma... so maybe not? 
+cramjam.lzma = cramjam.experimental.lzma # Some OS can be slow or have higher variability in their runtimes on CI settings.register_profile( @@ -32,7 +39,7 @@ def test_has_version(): @pytest.mark.parametrize("variant_str", VARIANTS) -@given(arr=st_np.arrays(st_np.scalar_dtypes(), shape=st.integers(0, int(1e5)))) +@given(arr=st_np.arrays(st_np.scalar_dtypes(), shape=st.integers(0, int(1e4)))) def test_variants_different_dtypes(variant_str, arr): variant = getattr(cramjam, variant_str) compressed = variant.compress(arr) diff --git a/libcramjam/Cargo.toml b/libcramjam/Cargo.toml index f7fc1900..9e51aebc 100644 --- a/libcramjam/Cargo.toml +++ b/libcramjam/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "libcramjam" -version = "0.1.5" +version = "0.2.0" edition = "2021" license = "MIT" description = "Compression library combining a plethora of algorithms in a similar as possible API" @@ -21,6 +21,7 @@ libdeflater = "^1" zstd = "0.11.1+zstd.1.5.2" zstd-safe = "7.0.0" # NOTE: This is the same dep version as zstd, as they don't re-export libc = { version = "0.2", optional = true } +xz2 = { version = "0.1.7", features = ["static"] } [build-dependencies] cbindgen = "^0.24" diff --git a/libcramjam/src/lib.rs b/libcramjam/src/lib.rs index d036aa95..bf4e22e4 100644 --- a/libcramjam/src/lib.rs +++ b/libcramjam/src/lib.rs @@ -3,6 +3,7 @@ pub mod bzip2; pub mod deflate; pub mod gzip; pub mod lz4; +pub mod lzma; pub mod snappy; pub mod zstd; @@ -80,4 +81,5 @@ mod tests { test_variant!(deflate, compressed_len = 157174, level = None); test_variant!(zstd, compressed_len = 4990, level = None); test_variant!(lz4, compressed_len = 303278, level = None); + test_variant!(lzma, compressed_len = 8020, level = None); } diff --git a/libcramjam/src/lzma.rs b/libcramjam/src/lzma.rs new file mode 100644 index 00000000..c8451ea2 --- /dev/null +++ b/libcramjam/src/lzma.rs @@ -0,0 +1,22 @@ +//! 
lzma de/compression interface +use std::io; +use std::io::{Read, Result, Write}; +pub use xz2; +use xz2::read::{XzDecoder, XzEncoder}; + +/// Decompress lzma (xz) data +#[inline(always)] +pub fn decompress(input: R, output: &mut W) -> Result { + let mut decoder = XzDecoder::new(input); + let n_bytes = io::copy(&mut decoder, output)?; + Ok(n_bytes as usize) +} + +/// Compress data into lzma (xz) format +#[inline(always)] +pub fn compress(data: R, output: &mut W, preset: Option) -> Result { + let preset = preset.unwrap_or(6); // same as python default + let mut encoder = XzEncoder::new(data, preset); + let n_bytes = io::copy(&mut encoder, output)?; + Ok(n_bytes as usize) +}