Skip to content

Commit c6ad17c

Browse files
authored
Upgrade DataFusion to arrow/parquet 57.0.0 (#17888)
## Which issue does this PR close? - Related to apache/arrow-rs#7835 - Closes #3666 Note while this PR looks massive, a large portion is display updates due to better display of Fields and DataTypes ## Rationale for this change Upgrade to the latest arrow Also, there are several new features in arrow-57 that I want to be able to test including Variant, arrow-avro, and a new parquet metadata reader. ## What changes are included in this PR? 1. Update arrow/parquet 2. Update prost 3. Update substrait 4. Update pbjson 5. Make API changes to avoid deprecated APIs ## Are these changes tested? By CI ## Are there any user-facing changes? New arrow
1 parent 1feb80f commit c6ad17c

File tree

82 files changed

+1138
-1200
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

82 files changed

+1138
-1200
lines changed

Cargo.lock

Lines changed: 114 additions & 205 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -91,19 +91,19 @@ ahash = { version = "0.8", default-features = false, features = [
9191
"runtime-rng",
9292
] }
9393
apache-avro = { version = "0.20", default-features = false }
94-
arrow = { version = "56.2.0", features = [
94+
arrow = { version = "57.0.0", features = [
9595
"prettyprint",
9696
"chrono-tz",
9797
] }
98-
arrow-buffer = { version = "56.2.0", default-features = false }
99-
arrow-flight = { version = "56.2.0", features = [
98+
arrow-buffer = { version = "57.0.0", default-features = false }
99+
arrow-flight = { version = "57.0.0", features = [
100100
"flight-sql-experimental",
101101
] }
102-
arrow-ipc = { version = "56.2.0", default-features = false, features = [
102+
arrow-ipc = { version = "57.0.0", default-features = false, features = [
103103
"lz4",
104104
] }
105-
arrow-ord = { version = "56.2.0", default-features = false }
106-
arrow-schema = { version = "56.2.0", default-features = false }
105+
arrow-ord = { version = "57.0.0", default-features = false }
106+
arrow-schema = { version = "57.0.0", default-features = false }
107107
async-trait = "0.1.89"
108108
bigdecimal = "0.4.8"
109109
bytes = "1.10"
@@ -156,20 +156,20 @@ half = { version = "2.7.0", default-features = false }
156156
hashbrown = { version = "0.14.5", features = ["raw"] }
157157
hex = { version = "0.4.3" }
158158
indexmap = "2.12.0"
159+
insta = { version = "1.43.2", features = ["glob", "filters"] }
159160
itertools = "0.14"
160161
log = "^0.4"
161162
object_store = { version = "0.12.4", default-features = false }
162163
parking_lot = "0.12"
163-
parquet = { version = "56.2.0", default-features = false, features = [
164+
parquet = { version = "57.0.0", default-features = false, features = [
164165
"arrow",
165166
"async",
166167
"object_store",
167168
] }
168-
pbjson = { version = "0.7.0" }
169-
pbjson-types = "0.7"
169+
pbjson = { version = "0.8.0" }
170+
pbjson-types = "0.8"
170171
# Should match arrow-flight's version of prost.
171-
insta = { version = "1.43.2", features = ["glob", "filters"] }
172-
prost = "0.13.1"
172+
prost = "0.14.1"
173173
rand = "0.9"
174174
recursive = "0.1.1"
175175
regex = "1.12"

datafusion-cli/src/functions.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -419,7 +419,9 @@ impl TableFunctionImpl for ParquetMetadataFunc {
419419
stats_max_value_arr.push(None);
420420
};
421421
compression_arr.push(format!("{:?}", column.compression()));
422-
encodings_arr.push(format!("{:?}", column.encodings()));
422+
// need to collect into Vec to format
423+
let encodings: Vec<_> = column.encodings().collect();
424+
encodings_arr.push(format!("{:?}", encodings));
423425
index_page_offset_arr.push(column.index_page_offset());
424426
dictionary_page_offset_arr.push(column.dictionary_page_offset());
425427
data_page_offset_arr.push(column.data_page_offset());

datafusion-cli/src/main.rs

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -497,7 +497,7 @@ mod tests {
497497
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
498498
| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |
499499
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
500-
| ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 |
500+
| ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 |
501501
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
502502
"#);
503503

@@ -510,7 +510,7 @@ mod tests {
510510
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
511511
| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |
512512
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
513-
| ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [RLE_DICTIONARY, PLAIN, RLE] | | 4 | 46 | 121 | 123 |
513+
| ../datafusion/core/tests/data/fixed_size_list_array.parquet | 0 | 2 | 1 | 123 | 0 | 125 | 4 | "f0.list.item" | INT64 | 1 | 4 | 0 | | 1 | 4 | SNAPPY | [PLAIN, RLE, RLE_DICTIONARY] | | 4 | 46 | 121 | 123 |
514514
+-------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+-------+-----------+-----------+------------------+----------------------+-----------------+-----------------+-------------+------------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
515515
"#);
516516

@@ -532,7 +532,7 @@ mod tests {
532532
+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
533533
| filename | row_group_id | row_group_num_rows | row_group_num_columns | row_group_bytes | column_id | file_offset | num_values | path_in_schema | type | stats_min | stats_max | stats_null_count | stats_distinct_count | stats_min_value | stats_max_value | compression | encodings | index_page_offset | dictionary_page_offset | data_page_offset | total_compressed_size | total_uncompressed_size |
534534
+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
535-
| ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [BIT_PACKED, RLE, PLAIN] | | | 4 | 152 | 163 |
535+
| ../parquet-testing/data/data_index_bloom_encoding_stats.parquet | 0 | 14 | 1 | 163 | 0 | 4 | 14 | "String" | BYTE_ARRAY | Hello | today | 0 | | Hello | today | GZIP(GzipLevel(6)) | [PLAIN, RLE, BIT_PACKED] | | | 4 | 152 | 163 |
536536
+-----------------------------------------------------------------+--------------+--------------------+-----------------------+-----------------+-----------+-------------+------------+----------------+------------+-----------+-----------+------------------+----------------------+-----------------+-----------------+--------------------+--------------------------+-------------------+------------------------+------------------+-----------------------+-------------------------+
537537
"#);
538538

@@ -592,9 +592,9 @@ mod tests {
592592
+-----------------------------------+-----------------+---------------------+------+------------------+
593593
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
594594
+-----------------------------------+-----------------+---------------------+------+------------------+
595-
| alltypes_plain.parquet | 1851 | 10181 | 2 | page_index=false |
596-
| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
597-
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 2 | page_index=false |
595+
| alltypes_plain.parquet | 1851 | 6957 | 2 | page_index=false |
596+
| alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true |
597+
| lz4_raw_compressed_larger.parquet | 380836 | 996 | 2 | page_index=false |
598598
+-----------------------------------+-----------------+---------------------+------+------------------+
599599
");
600600

@@ -623,9 +623,9 @@ mod tests {
623623
+-----------------------------------+-----------------+---------------------+------+------------------+
624624
| filename | file_size_bytes | metadata_size_bytes | hits | extra |
625625
+-----------------------------------+-----------------+---------------------+------+------------------+
626-
| alltypes_plain.parquet | 1851 | 10181 | 5 | page_index=false |
627-
| alltypes_tiny_pages.parquet | 454233 | 881418 | 2 | page_index=true |
628-
| lz4_raw_compressed_larger.parquet | 380836 | 2939 | 3 | page_index=false |
626+
| alltypes_plain.parquet | 1851 | 6957 | 5 | page_index=false |
627+
| alltypes_tiny_pages.parquet | 454233 | 267014 | 2 | page_index=true |
628+
| lz4_raw_compressed_larger.parquet | 380836 | 996 | 3 | page_index=false |
629629
+-----------------------------------+-----------------+---------------------+------+------------------+
630630
");
631631

datafusion-examples/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ serde_json = { workspace = true }
8181
tempfile = { workspace = true }
8282
test-utils = { path = "../test-utils" }
8383
tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] }
84-
tonic = "0.13.1"
84+
tonic = "0.14"
8585
tracing = { version = "0.1" }
8686
tracing-subscriber = { version = "0.3" }
8787
url = { workspace = true }

datafusion-examples/examples/flight/flight_client.rs

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
use std::collections::HashMap;
1919
use std::sync::Arc;
20+
use tonic::transport::Endpoint;
2021

2122
use datafusion::arrow::datatypes::Schema;
2223

@@ -34,7 +35,9 @@ async fn main() -> Result<(), Box<dyn std::error::Error>> {
3435
let testdata = datafusion::test_util::parquet_test_data();
3536

3637
// Create Flight client
37-
let mut client = FlightServiceClient::connect("http://localhost:50051").await?;
38+
let endpoint = Endpoint::new("http://localhost:50051")?;
39+
let channel = endpoint.connect().await?;
40+
let mut client = FlightServiceClient::new(channel);
3841

3942
// Call get_schema to get the schema of a Parquet file
4043
let request = tonic::Request::new(FlightDescriptor {

datafusion-examples/examples/flight/flight_server.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
// specific language governing permissions and limitations
1616
// under the License.
1717

18-
use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator};
18+
use arrow::ipc::writer::{CompressionContext, DictionaryTracker, IpcDataGenerator};
1919
use std::sync::Arc;
2020

2121
use arrow_flight::{PollInfo, SchemaAsIpc};
@@ -106,6 +106,7 @@ impl FlightService for FlightServiceImpl {
106106

107107
// add an initial FlightData message that sends schema
108108
let options = arrow::ipc::writer::IpcWriteOptions::default();
109+
let mut compression_context = CompressionContext::default();
109110
let schema_flight_data = SchemaAsIpc::new(&schema, &options);
110111

111112
let mut flights = vec![FlightData::from(schema_flight_data)];
@@ -115,7 +116,7 @@ impl FlightService for FlightServiceImpl {
115116

116117
for batch in &results {
117118
let (flight_dictionaries, flight_batch) = encoder
118-
.encoded_batch(batch, &mut tracker, &options)
119+
.encode(batch, &mut tracker, &options, &mut compression_context)
119120
.map_err(|e: ArrowError| Status::internal(e.to_string()))?;
120121

121122
flights.extend(flight_dictionaries.into_iter().map(Into::into));

datafusion-examples/examples/parquet_encrypted.rs

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,12 +16,13 @@
1616
// under the License.
1717

1818
use datafusion::common::DataFusionError;
19-
use datafusion::config::TableParquetOptions;
19+
use datafusion::config::{ConfigFileEncryptionProperties, TableParquetOptions};
2020
use datafusion::dataframe::{DataFrame, DataFrameWriteOptions};
2121
use datafusion::logical_expr::{col, lit};
2222
use datafusion::parquet::encryption::decrypt::FileDecryptionProperties;
2323
use datafusion::parquet::encryption::encrypt::FileEncryptionProperties;
2424
use datafusion::prelude::{ParquetReadOptions, SessionContext};
25+
use std::sync::Arc;
2526
use tempfile::TempDir;
2627

2728
#[tokio::main]
@@ -55,7 +56,7 @@ async fn main() -> datafusion::common::Result<()> {
5556

5657
// Write encrypted parquet
5758
let mut options = TableParquetOptions::default();
58-
options.crypto.file_encryption = Some((&encrypt).into());
59+
options.crypto.file_encryption = Some(ConfigFileEncryptionProperties::from(&encrypt));
5960
parquet_df
6061
.write_parquet(
6162
tempfile_str.as_str(),
@@ -100,7 +101,8 @@ async fn query_dataframe(df: &DataFrame) -> Result<(), DataFusionError> {
100101
// Setup encryption and decryption properties
101102
fn setup_encryption(
102103
parquet_df: &DataFrame,
103-
) -> Result<(FileEncryptionProperties, FileDecryptionProperties), DataFusionError> {
104+
) -> Result<(Arc<FileEncryptionProperties>, Arc<FileDecryptionProperties>), DataFusionError>
105+
{
104106
let schema = parquet_df.schema();
105107
let footer_key = b"0123456789012345".to_vec(); // 128bit/16
106108
let column_key = b"1234567890123450".to_vec(); // 128bit/16

datafusion-examples/examples/parquet_encrypted_with_kms.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -226,7 +226,7 @@ impl EncryptionFactory for TestEncryptionFactory {
226226
options: &EncryptionFactoryOptions,
227227
schema: &SchemaRef,
228228
_file_path: &Path,
229-
) -> Result<Option<FileEncryptionProperties>> {
229+
) -> Result<Option<Arc<FileEncryptionProperties>>> {
230230
let config: EncryptionConfig = options.to_extension_options()?;
231231

232232
// Generate a random encryption key for this file.
@@ -268,7 +268,7 @@ impl EncryptionFactory for TestEncryptionFactory {
268268
&self,
269269
_options: &EncryptionFactoryOptions,
270270
_file_path: &Path,
271-
) -> Result<Option<FileDecryptionProperties>> {
271+
) -> Result<Option<Arc<FileDecryptionProperties>>> {
272272
let decryption_properties =
273273
FileDecryptionProperties::with_key_retriever(Arc::new(TestKeyRetriever {}))
274274
.build()?;

datafusion/common/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ log = { workspace = true }
7171
object_store = { workspace = true, optional = true }
7272
parquet = { workspace = true, optional = true, default-features = true }
7373
paste = "1.0.15"
74-
pyo3 = { version = "0.25", optional = true }
74+
pyo3 = { version = "0.26", optional = true }
7575
recursive = { workspace = true, optional = true }
7676
sqlparser = { workspace = true, optional = true }
7777
tokio = { workspace = true }

0 commit comments

Comments
 (0)