546 changes: 256 additions & 290 deletions Cargo.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions Cargo.toml
@@ -231,3 +231,6 @@ unexpected_cfgs = { level = "warn", check-cfg = [
     "cfg(tarpaulin_include)",
 ] }
 unused_qualifications = "deny"
+
+[patch.crates-io]
+apache-avro = { git = "https://github.com/timsaucer/avro-rs.git", branch = "chore/move_to_liblzma" }
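
Note on the hunk above: a `[patch.crates-io]` entry redirects every `apache-avro` dependency in the workspace's resolution graph to the forked branch, without editing each member crate's manifest. Judging by the branch name, the fork moves avro-rs itself from `xz2` to `liblzma` so the whole tree can drop the older binding; presumably the patch can be deleted once an upstream apache-avro release ships that change.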
4 changes: 2 additions & 2 deletions datafusion/core/Cargo.toml
@@ -43,7 +43,7 @@ array_expressions = ["nested_expressions"]
 avro = ["datafusion-common/avro", "datafusion-datasource-avro"]
 backtrace = ["datafusion-common/backtrace"]
 compression = [
-    "xz2",
+    "liblzma",
     "bzip2",
     "flate2",
     "zstd",
@@ -138,6 +138,7 @@ flate2 = { version = "1.1.2", optional = true }
 futures = { workspace = true }
 hex = { workspace = true, optional = true }
 itertools = { workspace = true }
+liblzma = { version = "0.4.4", optional = true, features = ["static"] }
 log = { workspace = true }
 object_store = { workspace = true }
 parking_lot = { workspace = true }
@@ -150,7 +151,6 @@ tempfile = { workspace = true }
 tokio = { workspace = true }
 url = { workspace = true }
 uuid = { version = "1.18", features = ["v4", "js"] }
-xz2 = { version = "0.1", optional = true, features = ["static"] }
 zstd = { version = "0.13", optional = true, default-features = false }
 
 [dev-dependencies]
4 changes: 2 additions & 2 deletions datafusion/core/src/test/mod.rs
@@ -53,9 +53,9 @@ use datafusion_datasource_csv::partitioned_csv_config;
 use flate2::write::GzEncoder;
 #[cfg(feature = "compression")]
 use flate2::Compression as GzCompression;
-use object_store::local_unpartitioned_file;
 #[cfg(feature = "compression")]
-use xz2::write::XzEncoder;
+use liblzma::write::XzEncoder;
+use object_store::local_unpartitioned_file;
 #[cfg(feature = "compression")]
 use zstd::Encoder as ZstdEncoder;
 
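Since every call site in this PR keeps the `write::XzEncoder` / `read::XzDecoder` paths and only the crate name changes, `liblzma` appears to be a drop-in fork of `xz2`. A minimal round-trip sketch under that assumption (the input data and preset level are made up for illustration):

```rust
use std::io::{Read, Write};

use liblzma::read::XzDecoder;
use liblzma::write::XzEncoder;

fn main() -> std::io::Result<()> {
    let input = b"hello xz".repeat(100);

    // Compress into an in-memory buffer; 6 is the conventional preset.
    let mut encoder = XzEncoder::new(Vec::new(), 6);
    encoder.write_all(&input)?;
    let compressed = encoder.finish()?;

    // Decompress and verify the round trip.
    let mut decoder = XzDecoder::new(compressed.as_slice());
    let mut output = Vec::new();
    decoder.read_to_end(&mut output)?;
    assert_eq!(input, output);
    Ok(())
}
```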
6 changes: 4 additions & 2 deletions datafusion/datasource-parquet/src/metadata.rs
@@ -39,7 +39,9 @@ use object_store::path::Path;
 use object_store::{ObjectMeta, ObjectStore};
 use parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use parquet::arrow::parquet_to_arrow_schema;
-use parquet::file::metadata::{ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData};
+use parquet::file::metadata::{
+    PageIndexPolicy, ParquetMetaData, ParquetMetaDataReader, RowGroupMetaData,
+};
 use std::any::Any;
 use std::collections::HashMap;
 use std::sync::Arc;
@@ -148,7 +150,7 @@ impl<'a> DFParquetMetadata<'a> {
 
         if cache_metadata && file_metadata_cache.is_some() {
             // Need to retrieve the entire metadata for the caching to be effective.
-            reader = reader.with_page_indexes(true);
+            reader = reader.with_page_index_policy(PageIndexPolicy::Required);
         }
 
         let metadata = Arc::new(
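
For reviewers unfamiliar with the new parquet API: the boolean `with_page_indexes(bool)` gives way to a three-valued `PageIndexPolicy` (`Skip`, `Optional`, `Required`, per the parquet crate docs). `Required` makes a missing page index an error, which is what the cache path above wants. A minimal sketch, assuming a hypothetical local `file.parquet`:

```rust
use std::fs::File;

use parquet::errors::Result;
use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader};

fn read_full_metadata(path: &str) -> Result<()> {
    let file = File::open(path)?;

    // Required: fail loudly if the page indexes cannot be loaded, so a
    // cached entry can never silently lack them.
    let metadata = ParquetMetaDataReader::new()
        .with_page_index_policy(PageIndexPolicy::Required)
        .parse_and_finish(&file)?;

    println!("row groups: {}", metadata.num_row_groups());
    Ok(())
}
```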
6 changes: 3 additions & 3 deletions datafusion/datasource-parquet/src/opener.rs
@@ -54,7 +54,7 @@ use log::debug;
 use parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
 use parquet::arrow::async_reader::AsyncFileReader;
 use parquet::arrow::{ParquetRecordBatchStreamBuilder, ProjectionMask};
-use parquet::file::metadata::ParquetMetaDataReader;
+use parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader};
 
 /// Implements [`FileOpener`] for a parquet file
 pub(super) struct ParquetOpener {
@@ -652,8 +652,8 @@ async fn load_page_index<T: AsyncFileReader>(
     if missing_column_index || missing_offset_index {
         let m = Arc::try_unwrap(Arc::clone(parquet_metadata))
             .unwrap_or_else(|e| e.as_ref().clone());
-        let mut reader =
-            ParquetMetaDataReader::new_with_metadata(m).with_page_indexes(true);
+        let mut reader = ParquetMetaDataReader::new_with_metadata(m)
+            .with_page_index_policy(PageIndexPolicy::Required);
         reader.load_page_index(input).await?;
         let new_parquet_metadata = reader.finish()?;
         let new_arrow_reader =
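
One idiom in this hunk is easy to read past: `Arc::try_unwrap(...).unwrap_or_else(|e| e.as_ref().clone())` turns a shared `Arc<ParquetMetaData>` into an owned value, reusing the allocation when the handle passed in is the last reference and deep-cloning otherwise. A generic sketch of the pattern:

```rust
use std::sync::Arc;

/// Returns an owned `T`, avoiding a deep clone when `shared` is the
/// last handle to the allocation.
fn take_or_clone<T: Clone>(shared: Arc<T>) -> T {
    // try_unwrap succeeds only when the strong count is exactly 1;
    // otherwise we get the Arc back and clone the inner value.
    Arc::try_unwrap(shared).unwrap_or_else(|arc| arc.as_ref().clone())
}

fn main() {
    let metadata = Arc::new(vec![1, 2, 3]);
    // Mirrors the hunk: cloning the borrowed Arc first keeps the
    // original usable, at the cost of always taking the clone path.
    let owned = take_or_clone(Arc::clone(&metadata));
    assert_eq!(owned, *metadata);
}
```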
6 changes: 3 additions & 3 deletions datafusion/datasource/Cargo.toml
@@ -32,12 +32,12 @@ all-features = true
 
 [features]
 parquet = ["dep:parquet", "tempfile"]
-compression = ["async-compression", "xz2", "bzip2", "flate2", "zstd", "tokio-util"]
+compression = ["async-compression", "liblzma", "bzip2", "flate2", "zstd", "tokio-util"]
 default = ["compression"]
 
 [dependencies]
 arrow = { workspace = true }
-async-compression = { version = "0.4.19", features = [
+async-compression = { version = "0.4.30", features = [
     "bzip2",
     "gzip",
     "xz",
@@ -61,6 +61,7 @@ flate2 = { version = "1.1.2", optional = true }
 futures = { workspace = true }
 glob = "0.3.0"
 itertools = { workspace = true }
+liblzma = { version = "0.4.4", optional = true, features = ["static"] }
 log = { workspace = true }
 object_store = { workspace = true }
 parquet = { workspace = true, optional = true }
@@ -69,7 +70,6 @@ tempfile = { workspace = true, optional = true }
 tokio = { workspace = true }
 tokio-util = { version = "0.7.16", features = ["io"], optional = true }
 url = { workspace = true }
-xz2 = { version = "0.1", optional = true, features = ["static"] }
 zstd = { version = "0.13", optional = true, default-features = false }
 
 [dev-dependencies]
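
Related: the `async-compression` bump from 0.4.19 to 0.4.30 appears to belong to the same migration, picking up a release whose `xz` feature is backed by `liblzma` rather than `xz2`, so only one lzma binding remains in the dependency tree.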
4 changes: 2 additions & 2 deletions datafusion/datasource/src/file_compression_type.rs
@@ -43,13 +43,13 @@ use futures::stream::BoxStream;
 use futures::StreamExt;
 #[cfg(feature = "compression")]
 use futures::TryStreamExt;
+#[cfg(feature = "compression")]
+use liblzma::read::XzDecoder;
 use object_store::buffered::BufWriter;
 use tokio::io::AsyncWrite;
 #[cfg(feature = "compression")]
 use tokio_util::io::{ReaderStream, StreamReader};
-#[cfg(feature = "compression")]
-use xz2::read::XzDecoder;
 #[cfg(feature = "compression")]
 use zstd::Decoder as ZstdDecoder;
 
 /// Readable file compression type
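
The imports above hint at the streaming shape of this module: a `Stream` of bytes becomes an `AsyncBufRead` via `StreamReader`, is decoded, and is turned back into a `Stream` via `ReaderStream`. A self-contained round-trip sketch, assuming `async-compression` is compiled with its `tokio` and `xz` features (the input bytes are made up):

```rust
use async_compression::tokio::bufread::{XzDecoder, XzEncoder};
use bytes::Bytes;
use futures::stream::{self, StreamExt};
use tokio::io::BufReader;
use tokio_util::io::{ReaderStream, StreamReader};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let input = Bytes::from(vec![42u8; 8192]);

    // Stand-in for a stream of plain bytes arriving from object storage.
    let plain = stream::iter(vec![Ok::<_, std::io::Error>(input.clone())]);

    // Stream -> AsyncBufRead -> xz encode -> xz decode -> Stream again.
    let encoded = XzEncoder::new(StreamReader::new(plain));
    let decoded = XzDecoder::new(BufReader::new(encoded));
    let mut chunks = ReaderStream::new(decoded);

    let mut roundtrip = Vec::new();
    while let Some(chunk) = chunks.next().await {
        roundtrip.extend_from_slice(&chunk?);
    }
    assert_eq!(roundtrip.as_slice(), &input[..]);
    Ok(())
}
```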