From 8f9de92ec1652955baf577a65e055d991e48e2af Mon Sep 17 00:00:00 2001 From: Najib Date: Tue, 28 Oct 2025 16:20:34 +0100 Subject: [PATCH 1/2] Switch from xz2 to liblzma --- Cargo.lock | 107 +++++++++--------- Cargo.toml | 5 + README.md | 2 +- datafusion/core/Cargo.toml | 4 +- datafusion/core/src/test/mod.rs | 2 +- datafusion/datasource/Cargo.toml | 4 +- .../datasource/src/file_compression_type.rs | 2 +- 7 files changed, 64 insertions(+), 62 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 120dc29db223..cc3e9f579fbd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,14 +185,14 @@ checksum = "a23eb6b1614318a8071c9b2521f36b424b2c83db5eb3a0fead4a6c0809af6e61" [[package]] name = "apache-avro" version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3a033b4ced7c585199fb78ef50fca7fe2f444369ec48080c5fd072efa1a03cc7" +source = "git+https://github.com/apache/avro-rs?rev=3b202c58f12bd1217eccf8a0028e4176ee4aadf9#3b202c58f12bd1217eccf8a0028e4176ee4aadf9" dependencies = [ "bigdecimal", "bon", - "bzip2 0.6.1", + "bzip2", "crc32fast", "digest", + "liblzma", "log", "miniz_oxide", "num-bigint", @@ -207,7 +207,6 @@ dependencies = [ "strum_macros 0.27.2", "thiserror", "uuid", - "xz2", "zstd", ] @@ -520,19 +519,15 @@ dependencies = [ [[package]] name = "async-compression" -version = "0.4.19" +version = "0.4.32" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "06575e6a9673580f52661c92107baabffbf41e2141373441cbcdc47cb733003c" +checksum = "5a89bce6054c720275ac2432fbba080a66a2106a44a1b804553930ca6909f4e0" dependencies = [ - "bzip2 0.5.2", - "flate2", + "compression-codecs", + "compression-core", "futures-core", - "memchr", "pin-project-lite", "tokio", - "xz2", - "zstd", - "zstd-safe", ] [[package]] @@ -1319,15 +1314,6 @@ dependencies = [ "either", ] -[[package]] -name = "bzip2" -version = "0.5.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"49ecfb22d906f800d4fe833b6282cf4dc1c298f5057ca0b5445e5c209735ca47" -dependencies = [ - "bzip2-sys", -] - [[package]] name = "bzip2" version = "0.6.1" @@ -1337,16 +1323,6 @@ dependencies = [ "libbz2-rs-sys", ] -[[package]] -name = "bzip2-sys" -version = "0.1.13+1.0.8" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" -dependencies = [ - "cc", - "pkg-config", -] - [[package]] name = "cast" version = "0.3.0" @@ -1534,6 +1510,27 @@ dependencies = [ "unicode-width 0.2.1", ] +[[package]] +name = "compression-codecs" +version = "0.4.31" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef8a506ec4b81c460798f572caead636d57d3d7e940f998160f52bd254bf2d23" +dependencies = [ + "bzip2", + "compression-core", + "flate2", + "liblzma", + "memchr", + "zstd", + "zstd-safe", +] + +[[package]] +name = "compression-core" +version = "0.4.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e47641d3deaf41fb1538ac1f54735925e275eaf3bf4d55c81b137fba797e5cbb" + [[package]] name = "console" version = "0.15.11" @@ -1841,7 +1838,7 @@ dependencies = [ "arrow-schema", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "criterion", "ctor", @@ -1882,6 +1879,7 @@ dependencies = [ "glob", "insta", "itertools 0.14.0", + "liblzma", "log", "nix", "object_store", @@ -1901,7 +1899,6 @@ dependencies = [ "tokio", "url", "uuid", - "xz2", "zstd", ] @@ -2052,7 +2049,7 @@ dependencies = [ "async-compression", "async-trait", "bytes", - "bzip2 0.6.1", + "bzip2", "chrono", "criterion", "datafusion-common", @@ -2068,6 +2065,7 @@ dependencies = [ "futures", "glob", "itertools 0.14.0", + "liblzma", "log", "object_store", "rand 0.9.2", @@ -2075,7 +2073,6 @@ dependencies = [ "tokio", "tokio-util", "url", - "xz2", "zstd", ] @@ -4017,6 +4014,26 @@ dependencies = [ "windows-link 0.2.0", ] +[[package]] +name = "liblzma" +version = "0.4.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "73c36d08cad03a3fbe2c4e7bb3a9e84c57e4ee4135ed0b065cade3d98480c648" +dependencies = [ + "liblzma-sys", +] + +[[package]] +name = "liblzma-sys" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01b9596486f6d60c3bbe644c0e1be1aa6ccc472ad630fe8927b456973d7cb736" +dependencies = [ + "cc", + "libc", + "pkg-config", +] + [[package]] name = "libm" version = "0.2.15" @@ -4109,17 +4126,6 @@ dependencies = [ "twox-hash", ] -[[package]] -name = "lzma-sys" -version = "0.1.20" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5fda04ab3764e6cde78b9974eec4f779acaba7c4e84b36eca3cf77c581b85d27" -dependencies = [ - "cc", - "libc", - "pkg-config", -] - [[package]] name = "matchit" version = "0.8.4" @@ -7534,15 +7540,6 @@ version = "0.13.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "66fee0b777b0f5ac1c69bb06d361268faafa61cd4682ae064a171c16c433e9e4" -[[package]] -name = "xz2" -version = "0.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "388c44dc09d76f1536602ead6d325eb532f5c122f17782bd57fb47baeeb767e2" -dependencies = [ - "lzma-sys", -] - [[package]] name = "yansi" version = "1.0.1" diff --git a/Cargo.toml b/Cargo.toml index bf0f3fa0510e..341a8f0128e4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -183,6 +183,11 @@ testcontainers-modules = { version = "0.13" } tokio = { version = "1.48", features = ["macros", "rt", "sync"] } url = "2.5.7" +# Temporary override: pull apache-avro from upstream to include unreleased fixes. +# TODO: remove once the next version of apache-avro is published to crates.io and includes commit 3b202c5. 
+[patch.crates-io] +apache-avro = { git = "https://github.com/apache/avro-rs", rev = "3b202c58f12bd1217eccf8a0028e4176ee4aadf9" } + [workspace.lints.clippy] # Detects large stack-allocated futures that may cause stack overflow crashes (see threshold in clippy.toml) large_futures = "warn" diff --git a/README.md b/README.md index 5191496eaafe..fd2f61f3b047 100644 --- a/README.md +++ b/README.md @@ -113,7 +113,7 @@ This crate has several [features] which can be specified in your `Cargo.toml`. Default features: - `nested_expressions`: functions for working with nested type function such as `array_to_string` -- `compression`: reading files compressed with `xz2`, `bzip2`, `flate2`, and `zstd` +- `compression`: reading files compressed with `liblzma`, `bzip2`, `flate2`, and `zstd` - `crypto_expressions`: cryptographic functions such as `md5` and `sha256` - `datetime_expressions`: date and time functions such as `to_timestamp` - `encoding_expressions`: `encode` and `decode` functions diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index 22c9f43a902e..987e4bd4705d 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -43,7 +43,7 @@ array_expressions = ["nested_expressions"] avro = ["datafusion-common/avro", "datafusion-datasource-avro"] backtrace = ["datafusion-common/backtrace"] compression = [ - "xz2", + "liblzma", "bzip2", "flate2", "zstd", @@ -143,6 +143,7 @@ datafusion-sql = { workspace = true, optional = true } flate2 = { version = "1.1.4", optional = true } futures = { workspace = true } itertools = { workspace = true } +liblzma = { version = "0.4.4", optional = true, features = ["static"] } log = { workspace = true } object_store = { workspace = true } parking_lot = { workspace = true } @@ -156,7 +157,6 @@ tempfile = { workspace = true } tokio = { workspace = true } url = { workspace = true } uuid = { version = "1.18", features = ["v4", "js"] } -xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { 
version = "0.13", optional = true, default-features = false } [dev-dependencies] diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs index 68f83e7f1f11..8d5167404c8e 100644 --- a/datafusion/core/src/test/mod.rs +++ b/datafusion/core/src/test/mod.rs @@ -55,7 +55,7 @@ use flate2::write::GzEncoder; use flate2::Compression as GzCompression; use object_store::local_unpartitioned_file; #[cfg(feature = "compression")] -use xz2::write::XzEncoder; +use liblzma::write::XzEncoder; #[cfg(feature = "compression")] use zstd::Encoder as ZstdEncoder; diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index 8e0738448a75..7dfc1ce8222d 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -31,7 +31,7 @@ version.workspace = true all-features = true [features] -compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"] +compression = ["async-compression", "liblzma", "bzip2", "flate2", "zstd", "tokio-util"] default = ["compression"] [dependencies] @@ -60,6 +60,7 @@ flate2 = { version = "1.1.4", optional = true } futures = { workspace = true } glob = "0.3.0" itertools = { workspace = true } +liblzma = { version = "0.4.4", optional = true, features = ["static"] } log = { workspace = true } object_store = { workspace = true } rand = { workspace = true } @@ -67,7 +68,6 @@ tempfile = { workspace = true, optional = true } tokio = { workspace = true } tokio-util = { version = "0.7.16", features = ["io"], optional = true } url = { workspace = true } -xz2 = { version = "0.1", optional = true, features = ["static"] } zstd = { version = "0.13", optional = true, default-features = false } [dev-dependencies] diff --git a/datafusion/datasource/src/file_compression_type.rs b/datafusion/datasource/src/file_compression_type.rs index 7cc3142564e9..d3525e91ba03 100644 --- a/datafusion/datasource/src/file_compression_type.rs +++ b/datafusion/datasource/src/file_compression_type.rs @@ -48,7 
+48,7 @@ use tokio::io::AsyncWrite; #[cfg(feature = "compression")] use tokio_util::io::{ReaderStream, StreamReader}; #[cfg(feature = "compression")] -use xz2::read::XzDecoder; +use liblzma::read::XzDecoder; #[cfg(feature = "compression")] use zstd::Decoder as ZstdDecoder; From 3a37c1d93329cd17c73a77421e402fd59a00499e Mon Sep 17 00:00:00 2001 From: Najib Date: Tue, 28 Oct 2025 16:40:01 +0100 Subject: [PATCH 2/2] Bump async-compression --- datafusion/datasource/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/datasource/Cargo.toml b/datafusion/datasource/Cargo.toml index 7dfc1ce8222d..86778afa0946 100644 --- a/datafusion/datasource/Cargo.toml +++ b/datafusion/datasource/Cargo.toml @@ -36,7 +36,7 @@ default = ["compression"] [dependencies] arrow = { workspace = true } -async-compression = { version = "0.4.19", features = [ +async-compression = { version = "0.4.32", features = [ "bzip2", "gzip", "xz",