Skip to content

Commit

Permalink
Experimental support for LZMA / XZ codec (#127)
Browse files Browse the repository at this point in the history
* Add experimental LZMA / XZ support

* Reduce max size for test_variants_different_dtypes

Closes #123

* Switch to xz2 crate
  • Loading branch information
milesgranger authored Jan 22, 2024
1 parent 1876cbd commit 2d710c7
Show file tree
Hide file tree
Showing 12 changed files with 253 additions and 6 deletions.
25 changes: 23 additions & 2 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion cramjam-python/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "cramjam-python"
version = "2.8.0"
version = "2.8.1-dev1"
authors = ["Miles Granger <miles59923@gmail.com>"]
edition = "2021"
license = "MIT"
Expand Down
3 changes: 3 additions & 0 deletions cramjam-python/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,9 @@ bench-bzip2:
bench-zstd:
$(BASE_BENCH_CMD) zstd

# Benchmark target for the experimental LZMA / XZ codec: invokes
# BASE_BENCH_CMD with `lzma` as its argument.
bench-lzma:
$(BASE_BENCH_CMD) lzma

dev-install:
rm -rf ./dist
maturin build --release --out dist --interpreter $(shell which python)
Expand Down
1 change: 1 addition & 0 deletions cramjam-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ Available algorithms:
- [X] Gzip
- [X] Deflate
- [X] ZSTD
- [X] LZMA / XZ (cramjam.experimental.lzma) # experimental support!

All available for use as:

Expand Down
41 changes: 41 additions & 0 deletions cramjam-python/benchmarks/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -401,3 +401,44 @@ test_zstd[urls.10K-cramjam] 2,552.0180 (41.54) 6,011.9
test_zstd[urls.10K-zstd] 3,323.8390 (54.10) 5,461.0030 (37.50) 3,551.0236 (52.56) 229.1187 (61.53) 3,528.1250 (52.60) 122.4380 (44.83) 7;7 281.6089 (0.02) 243 1
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
```


#### LZMA

`make bench-lzma`

```bash
-------------------------------------------------------------------------------------------------------- benchmark: 28 tests --------------------------------------------------------------------------------------------------------
Name (time in ms) Min Max Mean StdDev Median IQR Outliers OPS Rounds Iterations
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
test_lzma[Mark.Twain-Tom.Sawyer.txt-cramjam] 2.3947 (1.02) 5.7831 (1.23) 2.5805 (1.0) 0.3105 (2.09) 2.5408 (1.0) 0.0762 (1.0) 2;7 387.5274 (1.0) 145 1
test_lzma[Mark.Twain-Tom.Sawyer.txt-lzma] 2.3582 (1.0) 4.6904 (1.0) 2.5993 (1.01) 0.1785 (1.20) 2.5776 (1.01) 0.1257 (1.65) 32;19 384.7239 (0.99) 327 1
test_lzma[alice29.txt-cramjam] 28.0502 (11.89) 31.9519 (6.81) 29.8548 (11.57) 0.8266 (5.57) 29.8514 (11.75) 0.8356 (10.97) 8;3 33.4954 (0.09) 35 1
test_lzma[alice29.txt-lzma] 29.5697 (12.54) 30.4112 (6.48) 30.0151 (11.63) 0.1881 (1.27) 30.0110 (11.81) 0.2454 (3.22) 7;0 33.3165 (0.09) 33 1
test_lzma[asyoulik.txt-cramjam] 23.5887 (10.00) 25.8111 (5.50) 23.9925 (9.30) 0.3971 (2.68) 23.8906 (9.40) 0.1810 (2.38) 4;5 41.6796 (0.11) 41 1
test_lzma[asyoulik.txt-lzma] 23.7592 (10.08) 42.0096 (8.96) 34.4439 (13.35) 8.8400 (59.58) 41.7199 (16.42) 17.8603 (234.43) 16;0 29.0327 (0.07) 39 1
test_lzma[fifty-four-mb-random-cramjam] 15,819.4464 (>1000.0) 16,100.5117 (>1000.0) 15,919.2390 (>1000.0) 127.4622 (859.03) 15,837.1569 (>1000.0) 202.0210 (>1000.0) 1;0 0.0628 (0.00) 5 1
test_lzma[fifty-four-mb-random-lzma] 16,032.8752 (>1000.0) 16,277.5432 (>1000.0) 16,151.0817 (>1000.0) 88.0539 (593.44) 16,157.7523 (>1000.0) 92.8587 (>1000.0) 2;0 0.0619 (0.00) 5 1
test_lzma[fifty-four-mb-repeating-cramjam] 698.1282 (296.04) 709.9000 (151.35) 701.2151 (271.74) 4.9695 (33.49) 699.1874 (275.18) 4.8818 (64.08) 1;0 1.4261 (0.00) 5 1
test_lzma[fifty-four-mb-repeating-lzma] 692.3175 (293.58) 696.7862 (148.55) 695.2843 (269.44) 1.7417 (11.74) 695.5683 (273.76) 1.6960 (22.26) 1;0 1.4383 (0.00) 5 1
test_lzma[fireworks.jpeg-cramjam] 11.3957 (4.83) 13.3655 (2.85) 11.9349 (4.63) 0.3479 (2.34) 11.8311 (4.66) 0.3853 (5.06) 14;2 83.7880 (0.22) 54 1
test_lzma[fireworks.jpeg-lzma] 12.1977 (5.17) 14.2095 (3.03) 12.9411 (5.02) 0.3885 (2.62) 12.9547 (5.10) 0.5422 (7.12) 20;1 77.2733 (0.20) 71 1
test_lzma[geo.protodata-cramjam] 8.7642 (3.72) 10.6508 (2.27) 9.4973 (3.68) 0.3392 (2.29) 9.5402 (3.75) 0.4896 (6.43) 31;1 105.2934 (0.27) 92 1
test_lzma[geo.protodata-lzma] 8.9854 (3.81) 9.6380 (2.05) 9.1964 (3.56) 0.1484 (1.0) 9.1581 (3.60) 0.1816 (2.38) 25;5 108.7380 (0.28) 96 1
test_lzma[html-cramjam] 9.8727 (4.19) 10.6944 (2.28) 10.1512 (3.93) 0.1586 (1.07) 10.1483 (3.99) 0.2134 (2.80) 32;2 98.5104 (0.25) 96 1
test_lzma[html-lzma] 9.8842 (4.19) 10.7465 (2.29) 10.2713 (3.98) 0.2212 (1.49) 10.2653 (4.04) 0.4118 (5.40) 39;0 97.3591 (0.25) 99 1
test_lzma[html_x_4-cramjam] 27.2375 (11.55) 29.1826 (6.22) 28.2651 (10.95) 0.7348 (4.95) 28.8029 (11.34) 1.4011 (18.39) 13;0 35.3793 (0.09) 35 1
test_lzma[html_x_4-lzma] 27.0631 (11.48) 28.5466 (6.09) 27.7356 (10.75) 0.3388 (2.28) 27.7787 (10.93) 0.4396 (5.77) 10;0 36.0547 (0.09) 36 1
test_lzma[kppkn.gtb-cramjam] 42.0213 (17.82) 42.5622 (9.07) 42.2960 (16.39) 0.1657 (1.12) 42.2719 (16.64) 0.3047 (4.00) 10;0 23.6429 (0.06) 24 1
test_lzma[kppkn.gtb-lzma] 41.9663 (17.80) 43.1549 (9.20) 42.3119 (16.40) 0.2349 (1.58) 42.2610 (16.63) 0.2133 (2.80) 4;1 23.6340 (0.06) 24 1
test_lzma[lcet10.txt-cramjam] 84.9825 (36.04) 86.3458 (18.41) 85.5724 (33.16) 0.4918 (3.31) 85.4851 (33.64) 0.9044 (11.87) 4;0 11.6860 (0.03) 12 1
test_lzma[lcet10.txt-lzma] 85.4119 (36.22) 89.5356 (19.09) 88.1896 (34.18) 1.0074 (6.79) 88.2839 (34.75) 0.4725 (6.20) 3;3 11.3392 (0.03) 12 1
test_lzma[paper-100k.pdf-cramjam] 13.5110 (5.73) 15.4715 (3.30) 13.9447 (5.40) 0.3596 (2.42) 13.8906 (5.47) 0.3166 (4.16) 11;3 71.7116 (0.19) 58 1
test_lzma[paper-100k.pdf-lzma] 13.4903 (5.72) 15.6489 (3.34) 13.8804 (5.38) 0.3380 (2.28) 13.8328 (5.44) 0.3119 (4.09) 9;3 72.0441 (0.19) 69 1
test_lzma[plrabn12.txt-cramjam] 104.2685 (44.21) 105.8319 (22.56) 104.6718 (40.56) 0.4610 (3.11) 104.5525 (41.15) 0.3270 (4.29) 1;1 9.5537 (0.02) 10 1
test_lzma[plrabn12.txt-lzma] 104.2437 (44.20) 105.7301 (22.54) 105.0866 (40.72) 0.3972 (2.68) 105.0992 (41.36) 0.3382 (4.44) 2;1 9.5160 (0.02) 10 1
test_lzma[urls.10K-cramjam] 115.7511 (49.08) 121.7127 (25.95) 118.9406 (46.09) 2.0999 (14.15) 119.2785 (46.94) 3.7894 (49.74) 4;0 8.4076 (0.02) 9 1
test_lzma[urls.10K-lzma] 114.8733 (48.71) 118.0015 (25.16) 115.8224 (44.88) 0.9726 (6.55) 115.6923 (45.53) 1.0803 (14.18) 1;1 8.6339 (0.02) 9 1
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
```

24 changes: 24 additions & 0 deletions cramjam-python/benchmarks/test_bench.py
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,30 @@ def test_bzip2(benchmark, file, use_cramjam: bool):
)


@pytest.mark.parametrize(
    "use_cramjam", (True, False), ids=lambda val: "cramjam" if val else "lzma"
)
@pytest.mark.parametrize("file", FILES, ids=lambda val: val.name)
def test_lzma(benchmark, file, use_cramjam: bool):
    """Benchmark an LZMA round trip of ``file``'s bytes.

    When ``use_cramjam`` is true the codec under test is
    ``cramjam.experimental.lzma``; otherwise it is the standard library
    ``lzma`` module. Both expose ``compress`` / ``decompress``, which are
    handed to ``round_trip`` through the ``benchmark`` fixture.
    """
    import lzma

    payload = file.read_bytes()
    # Pick the module once; both branches benchmark the same round trip.
    codec = cramjam.experimental.lzma if use_cramjam else lzma
    benchmark(
        round_trip,
        compress=codec.compress,
        decompress=codec.decompress,
        data=payload,
    )


@profile
def memory_profile():
import snappy
Expand Down
123 changes: 123 additions & 0 deletions cramjam-python/src/experimental.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
//! Experimental and unstable implementations.
//! This module makes no effort to maintain SemVer between
//! releases.
use pyo3::prelude::*;
use pyo3::PyResult;

/// Register every experimental submodule on the given Python module `m`.
///
/// Acquires the GIL and delegates to `add_experimental_modules`, returning
/// whatever result that registration produces.
pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> {
    Python::with_gil(|gil| add_experimental_modules(gil, m))
}
/// Build the `lzma` Python submodule, populate it via `lzma::init_py_module`,
/// and attach it to `m`.
fn add_experimental_modules(py: Python, m: &PyModule) -> PyResult<()> {
    let submodule = PyModule::new(py, "lzma")?;
    lzma::init_py_module(submodule)?;
    // `add_submodule` already yields `PyResult<()>`, so it is the tail expression.
    m.add_submodule(submodule)
}

pub mod lzma {
    //! LZMA / XZ de/compression interface
    use crate::exceptions::{CompressionError, DecompressionError};
    use crate::io::{AsBytes, RustyBuffer};
    use crate::BytesType;
    use pyo3::exceptions::PyNotImplementedError;
    use pyo3::prelude::*;
    use pyo3::wrap_pyfunction;
    use pyo3::PyResult;
    use std::io::Cursor;

    /// Preset used by the streaming [`Compressor`] when the caller passes `None`.
    ///
    /// Kept equal to the default that `libcramjam::lzma::compress` applies
    /// (6, the same as Python's `lzma` default) so that the one-shot and
    /// streaming APIs compress with the same settings by default.
    const DEFAULT_PRESET: u32 = 6;

    /// Register the LZMA functions and the `Compressor` / `Decompressor`
    /// classes on the Python module `m`.
    pub(crate) fn init_py_module(m: &PyModule) -> PyResult<()> {
        m.add_function(wrap_pyfunction!(compress, m)?)?;
        m.add_function(wrap_pyfunction!(decompress, m)?)?;
        m.add_function(wrap_pyfunction!(compress_into, m)?)?;
        m.add_function(wrap_pyfunction!(decompress_into, m)?)?;
        m.add_class::<Compressor>()?;
        m.add_class::<Decompressor>()?;
        Ok(())
    }

    /// LZMA decompression.
    ///
    /// Errors from the underlying decoder are raised as `DecompressionError`.
    ///
    /// Python Example
    /// --------------
    /// ```python
    /// >>> # bytes or bytearray; bytearray is faster
    /// >>> cramjam.experimental.lzma.decompress(compressed_bytes, output_len=Optional[int])
    /// ```
    #[pyfunction]
    pub fn decompress(py: Python, data: BytesType, output_len: Option<usize>) -> PyResult<RustyBuffer> {
        crate::generic!(py, libcramjam::lzma::decompress[data], output_len = output_len)
            .map_err(DecompressionError::from_err)
    }

    /// LZMA compression.
    ///
    /// `preset` is forwarded to `libcramjam::lzma::compress` as the compression
    /// level; when omitted, that function picks its own default.
    /// Errors from the underlying encoder are raised as `CompressionError`.
    ///
    /// Python Example
    /// --------------
    /// ```python
    /// >>> _ = cramjam.experimental.lzma.compress(b'some bytes here')
    /// ```
    #[pyfunction]
    pub fn compress(
        py: Python,
        data: BytesType,
        preset: Option<u32>,
        output_len: Option<usize>,
    ) -> PyResult<RustyBuffer> {
        crate::generic!(
            py,
            libcramjam::lzma::compress[data],
            output_len = output_len,
            level = preset
        )
        .map_err(CompressionError::from_err)
    }

    /// Compress directly into an output buffer
    ///
    /// `preset` is forwarded as the compression level. Failures are raised as
    /// `CompressionError`.
    #[pyfunction]
    pub fn compress_into(py: Python, input: BytesType, mut output: BytesType, preset: Option<u32>) -> PyResult<usize> {
        crate::generic!(py, libcramjam::lzma::compress[input, output], level = preset)
            .map_err(CompressionError::from_err)
    }

    /// Decompress directly into an output buffer
    ///
    /// Failures are raised as `DecompressionError`.
    #[pyfunction]
    pub fn decompress_into(py: Python, input: BytesType, mut output: BytesType) -> PyResult<usize> {
        crate::generic!(py, libcramjam::lzma::decompress[input, output]).map_err(DecompressionError::from_err)
    }

    /// LZMA / XZ Compressor object for streaming compression
    #[pyclass]
    pub struct Compressor {
        // `Option` so the encoder can be taken out of the struct; see `finish`.
        inner: Option<libcramjam::lzma::xz2::write::XzEncoder<Cursor<Vec<u8>>>>,
    }

    #[pymethods]
    impl Compressor {
        /// Initialize a new `Compressor` instance.
        ///
        /// `preset` selects the compression preset; when omitted the default
        /// preset (6) is used, matching the one-shot `compress` function.
        #[new]
        pub fn __init__(preset: Option<u32>) -> PyResult<Self> {
            let preset = preset.unwrap_or(DEFAULT_PRESET);
            let inner = libcramjam::lzma::xz2::write::XzEncoder::new(Cursor::new(vec![]), preset);
            Ok(Self { inner: Some(inner) })
        }

        /// Compress input into the current compressor's stream.
        pub fn compress(&mut self, input: &[u8]) -> PyResult<usize> {
            crate::io::stream_compress(&mut self.inner, input)
        }

        /// Flush and return current compressed stream
        ///
        /// Not supported for LZMA: this always raises `NotImplementedError`.
        /// Call `finish` once all input has been written instead.
        pub fn flush(&mut self) -> PyResult<RustyBuffer> {
            Err(PyNotImplementedError::new_err(
                "`.flush` for LZMA not implemented, just use `.finish()` instead when your done.",
            ))
        }

        /// Consume the current compressor state and return the compressed stream
        /// **NB** The compressor will not be usable after this method is called.
        pub fn finish(&mut self) -> PyResult<RustyBuffer> {
            // Finishing the encoder yields the underlying `Cursor`; unwrap it to the raw bytes.
            crate::io::stream_finish(&mut self.inner, |inner| inner.finish().map(|c| c.into_inner()))
        }
    }

    crate::make_decompressor!(lzma);
}
2 changes: 2 additions & 0 deletions cramjam-python/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ pub mod brotli;
pub mod bzip2;
pub mod deflate;
pub mod exceptions;
pub mod experimental;
pub mod gzip;
pub mod io;
pub mod lz4;
Expand Down Expand Up @@ -369,6 +370,7 @@ fn cramjam(py: Python, m: &PyModule) -> PyResult<()> {
make_submodule!(py -> m -> gzip);
make_submodule!(py -> m -> deflate);
make_submodule!(py -> m -> zstd);
make_submodule!(py -> m -> experimental);

Ok(())
}
11 changes: 9 additions & 2 deletions cramjam-python/tests/test_variants.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@
from hypothesis import strategies as st, given, settings
from hypothesis.extra import numpy as st_np

VARIANTS = ("snappy", "brotli", "bzip2", "lz4", "gzip", "deflate", "zstd")
VARIANTS = ("snappy", "brotli", "bzip2", "lz4", "gzip", "deflate", "zstd", "lzma")

# LZMA is experimental, but in testing we'll treat it like it's not in the
# experimental submodule.
# TODO: Maybe rename it to XZ, since LZMA is the legacy version.
# ref: https://github.com/fpgaminer/rust-lzma/issues/18, but then
# the rustlib and the clib both are lzma... so maybe not?
cramjam.lzma = cramjam.experimental.lzma

# Some OS can be slow or have higher variability in their runtimes on CI
settings.register_profile(
Expand All @@ -32,7 +39,7 @@ def test_has_version():


@pytest.mark.parametrize("variant_str", VARIANTS)
@given(arr=st_np.arrays(st_np.scalar_dtypes(), shape=st.integers(0, int(1e5))))
@given(arr=st_np.arrays(st_np.scalar_dtypes(), shape=st.integers(0, int(1e4))))
def test_variants_different_dtypes(variant_str, arr):
variant = getattr(cramjam, variant_str)
compressed = variant.compress(arr)
Expand Down
3 changes: 2 additions & 1 deletion libcramjam/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "libcramjam"
version = "0.1.5"
version = "0.2.0"
edition = "2021"
license = "MIT"
description = "Compression library combining a plethora of algorithms in a similar as possible API"
Expand All @@ -21,6 +21,7 @@ libdeflater = "^1"
zstd = "0.11.1+zstd.1.5.2"
zstd-safe = "7.0.0" # NOTE: This is the same dep version as zstd, as they don't re-export
libc = { version = "0.2", optional = true }
xz2 = { version = "0.1.7", features = ["static"] }

[build-dependencies]
cbindgen = "^0.24"
Expand Down
2 changes: 2 additions & 0 deletions libcramjam/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ pub mod bzip2;
pub mod deflate;
pub mod gzip;
pub mod lz4;
pub mod lzma;
pub mod snappy;
pub mod zstd;

Expand Down Expand Up @@ -80,4 +81,5 @@ mod tests {
test_variant!(deflate, compressed_len = 157174, level = None);
test_variant!(zstd, compressed_len = 4990, level = None);
test_variant!(lz4, compressed_len = 303278, level = None);
test_variant!(lzma, compressed_len = 8020, level = None);
}
22 changes: 22 additions & 0 deletions libcramjam/src/lzma.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
//! LZMA / XZ de/compression interface
use std::io;
use std::io::{Read, Result, Write};
pub use xz2;
use xz2::read::{XzDecoder, XzEncoder};

/// Decompress XZ data read from `input`, writing the decoded bytes to `output`.
///
/// Returns the number of decompressed bytes copied into `output`.
///
/// # Errors
/// Propagates any error raised by the decoder or by `output` while copying.
/// Returns an `InvalidData` error if the copied byte count cannot be
/// represented as a `usize` on the current target.
#[inline(always)]
pub fn decompress<W: Write + ?Sized, R: Read>(input: R, output: &mut W) -> Result<usize> {
    let mut decoder = XzDecoder::new(input);
    let n_bytes = io::copy(&mut decoder, output)?;
    // `io::copy` reports a `u64`; convert with a check instead of an `as`
    // cast, which would silently truncate on 32-bit targets.
    usize::try_from(n_bytes).map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))
}

/// Compress everything read from `data` into XZ format, writing the result to `output`.
///
/// `preset` is the compression preset handed to the encoder; `None` selects 6,
/// the same default Python's `lzma` module uses.
///
/// Returns the number of compressed bytes copied into `output`.
///
/// # Errors
/// Propagates any error raised by the encoder or by `output` while copying.
/// Returns an `InvalidData` error if the copied byte count cannot be
/// represented as a `usize` on the current target.
#[inline(always)]
pub fn compress<W: Write + ?Sized, R: Read>(data: R, output: &mut W, preset: Option<u32>) -> Result<usize> {
    let preset = preset.unwrap_or(6); // same as python default
    let mut encoder = XzEncoder::new(data, preset);
    let n_bytes = io::copy(&mut encoder, output)?;
    // `io::copy` reports a `u64`; convert with a check instead of an `as`
    // cast, which would silently truncate on 32-bit targets.
    usize::try_from(n_bytes).map_err(|err| io::Error::new(io::ErrorKind::InvalidData, err))
}

0 comments on commit 2d710c7

Please sign in to comment.