Skip to content

Commit f209f98

Browse files
LiaCastaneda and alamb authored
Upgrade to arrow 56.1.0 (apache#17275) (#57)
* Update to arrow/parquet 56.1.0
* Adjust for new parquet sizes, update for deprecated API
* Thread through max_predicate_cache_size, add test

(cherry picked from commit 980c948)

Co-authored-by: Andrew Lamb <andrew@nerdnetworks.org>
1 parent 1075a01 commit f209f98

File tree

21 files changed

+310
-68
lines changed

21 files changed

+310
-68
lines changed

Cargo.lock

Lines changed: 35 additions & 34 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -90,19 +90,19 @@ ahash = { version = "0.8", default-features = false, features = [
9090
"runtime-rng",
9191
] }
9292
apache-avro = { version = "0.20", default-features = false }
93-
arrow = { version = "56.0.0", features = [
93+
arrow = { version = "56.1.0", features = [
9494
"prettyprint",
9595
"chrono-tz",
9696
] }
97-
arrow-buffer = { version = "56.0.0", default-features = false }
98-
arrow-flight = { version = "56.0.0", features = [
97+
arrow-buffer = { version = "56.1.0", default-features = false }
98+
arrow-flight = { version = "56.1.0", features = [
9999
"flight-sql-experimental",
100100
] }
101-
arrow-ipc = { version = "56.0.0", default-features = false, features = [
101+
arrow-ipc = { version = "56.1.0", default-features = false, features = [
102102
"lz4",
103103
] }
104-
arrow-ord = { version = "56.0.0", default-features = false }
105-
arrow-schema = { version = "56.0.0", default-features = false }
104+
arrow-ord = { version = "56.1.0", default-features = false }
105+
arrow-schema = { version = "56.1.0", default-features = false }
106106
async-trait = "0.1.89"
107107
bigdecimal = "0.4.8"
108108
bytes = "1.10"
@@ -157,7 +157,7 @@ itertools = "0.14"
157157
log = "^0.4"
158158
object_store = { version = "0.12.3", default-features = false }
159159
parking_lot = "0.12"
160-
parquet = { version = "56.0.0", default-features = false, features = [
160+
parquet = { version = "56.1.0", default-features = false, features = [
161161
"arrow",
162162
"async",
163163
"object_store",

datafusion-cli/src/main.rs

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -570,15 +570,15 @@ mod tests {
570570
let df = ctx.sql(sql).await?;
571571
let rbs = df.collect().await?;
572572

573-
assert_snapshot!(batches_to_string(&rbs),@r#"
573+
assert_snapshot!(batches_to_string(&rbs),@r"
574574
+-----------------------------------+-----------------+---------------------+------+------------------+
575575
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
576576
+-----------------------------------+-----------------+---------------------+------+------------------+
577577
| alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false |
578-
| alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true |
578+
| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
579579
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false |
580580
+-----------------------------------+-----------------+---------------------+------+------------------+
581-
"#);
581+
");
582582

583583
// increase the number of hits
584584
ctx.sql("select * from alltypes_plain")
@@ -601,15 +601,15 @@ mod tests {
601601
let df = ctx.sql(sql).await?;
602602
let rbs = df.collect().await?;
603603

604-
assert_snapshot!(batches_to_string(&rbs),@r#"
604+
assert_snapshot!(batches_to_string(&rbs),@r"
605605
+-----------------------------------+-----------------+---------------------+------+------------------+
606606
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
607607
+-----------------------------------+-----------------+---------------------+------+------------------+
608608
| alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false |
609-
| alltypes_tiny_pages.parquet | 454233 | 881634 | 2 | page_index=true |
609+
| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
610610
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false |
611611
+-----------------------------------+-----------------+---------------------+------+------------------+
612-
"#);
612+
");
613613

614614
Ok(())
615615
}

datafusion/common/src/config.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -560,6 +560,14 @@ config_namespace! {
560560
/// (reading) Use any available bloom filters when reading parquet files
561561
pub bloom_filter_on_read: bool, default = true
562562

563+
/// (reading) The maximum predicate cache size, in bytes. When
564+
/// `pushdown_filters` is enabled, sets the maximum memory used to cache
565+
/// the results of predicate evaluation between filter evaluation and
566+
/// output generation. Decreasing this value will reduce memory usage,
567+
/// but may increase IO and CPU usage. None means use the default
568+
/// parquet reader setting. 0 means no caching.
569+
pub max_predicate_cache_size: Option<usize>, default = None
570+
563571
// The following options affect writing to parquet files
564572
// and map to parquet::file::properties::WriterProperties
565573

datafusion/common/src/file_options/parquet_writer.rs

Lines changed: 8 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ impl ParquetOptions {
233233
binary_as_string: _, // not used for writer props
234234
coerce_int96: _, // not used for writer props
235235
skip_arrow_metadata: _,
236+
max_predicate_cache_size: _,
236237
} = self;
237238

238239
let mut builder = WriterProperties::builder()
@@ -425,6 +426,10 @@ pub(crate) fn parse_statistics_string(str_setting: &str) -> Result<EnabledStatis
425426
#[cfg(feature = "parquet")]
426427
#[cfg(test)]
427428
mod tests {
429+
use super::*;
430+
use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
431+
#[cfg(feature = "parquet_encryption")]
432+
use crate::encryption::map_encryption_to_config_encryption;
428433
use parquet::{
429434
basic::Compression,
430435
file::properties::{
@@ -434,11 +439,6 @@ mod tests {
434439
};
435440
use std::collections::HashMap;
436441

437-
use super::*;
438-
use crate::config::{ParquetColumnOptions, ParquetEncryptionOptions, ParquetOptions};
439-
#[cfg(feature = "parquet_encryption")]
440-
use crate::encryption::map_encryption_to_config_encryption;
441-
442442
const COL_NAME: &str = "configured";
443443

444444
/// Take the column defaults provided in [`ParquetOptions`], and generate a non-default col config.
@@ -500,6 +500,7 @@ mod tests {
500500
binary_as_string: defaults.binary_as_string,
501501
skip_arrow_metadata: defaults.skip_arrow_metadata,
502502
coerce_int96: None,
503+
max_predicate_cache_size: defaults.max_predicate_cache_size,
503504
}
504505
}
505506

@@ -606,6 +607,8 @@ mod tests {
606607
maximum_buffered_record_batches_per_stream: global_options_defaults
607608
.maximum_buffered_record_batches_per_stream,
608609
bloom_filter_on_read: global_options_defaults.bloom_filter_on_read,
610+
max_predicate_cache_size: global_options_defaults
611+
.max_predicate_cache_size,
609612
schema_force_view_types: global_options_defaults.schema_force_view_types,
610613
binary_as_string: global_options_defaults.binary_as_string,
611614
skip_arrow_metadata: global_options_defaults.skip_arrow_metadata,

0 commit comments

Comments (0)