From 7dd96bcebaa10d7c01ccf3b22f98d2aca872ea09 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 08:24:38 -0400 Subject: [PATCH 1/6] Add DataFusion 47.0.0 Upgrade Guide --- docs/source/library-user-guide/upgrading.md | 104 ++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 11fd49566522..b41dc54e1be7 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -19,6 +19,110 @@ # Upgrade Guides +## DataFusion `47.0.0` + +This section calls out some of the major changes in the `47.0.0` release of DataFusion. + +Here are some example upgrade PRs that demonstrate some changed required when upgrading from DataFusion 46.0.0: +* [delta-rs Upgrade to `47.0.0`](https://github.com/delta-io/delta-rs/pull/3378) +* [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563) +* [Sail Upgrade to `47.0.0`](https://github.com/lakehq/sail/pull/434) + +### Upgrades to `arrow` and `parquet` 55.0.0 and `object_store` 0.12.0 + +Several APIs are changed in the underlying arrow and parquet libraries to use a +`u64` instead of `usize` to better support WASM (See [#7371] and [#6961]) + +Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been changed to return `static` lifetimes (See [#6619]) + +[#6619]: https://github.com/apache/arrow-rs/pull/6619 +[#7371]: https://github.com/apache/arrow-rs/pull/7371 +[#6961]: https://github.com/apache/arrow-rs/pull/6961 + +This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as + +```rust +impl Objectstore { + ... + // The range is now a u64 instead of usize + async fn get_range(&self, location: &Path, range: Range<u64>) -> ObjectStoreResult<Bytes> { + self.inner.get_range(location, range).await + } + ... 
+ // the lifetime is now 'static instead of '_ (meaning the captured closure can't contain references) + // (this also applies to list_with_offset) + fn list(&self, prefix: Option<&Path>) -> BoxStream<'static, ObjectStoreResult<ObjectMeta>> { + self.inner.list(prefix) + } +} +``` + +The `ParquetObjectReader` has been updated to no longer require the object size +(it can be fetched using a single suffix request). See [#7334] for details. + +[#7334]: https://github.com/apache/arrow-rs/pull/7334 + +Pattern in DataFusion `46.0.0`: + +```rust +let meta: ObjectMeta = ...; +let reader = ParquetObjectReader::new(store, meta); +``` + +Pattern in DataFusion `47.0.0`: +```rust +let meta: ObjectMeta = ...; +let reader = ParquetObjectReader::new(store, location) + .with_file_size(meta.size); +``` + +### `DisplayFormatType::TreeRender` + +DataFusion now supports [`tree` style explain plans]. Implementations of +`ExecutionPlan` must also provide a description in the +`DisplayFormatType::TreeRender` format. This can be the same as the existing +`DisplayFormatType::Default`. + +[`tree` style explain plans]: https://datafusion.apache.org/user-guide/sql/explain.html#tree-format-default + +### Removed Deprecated APIs + +Several APIs have been removed in this release. These were either deprecated +previously or were hard to use correctly such as the multiple different +`ScalarUDFImpl::invoke*` APIs. See [#15130], [#15123], and [#15027] for more +details. + +[#15130]: https://github.com/apache/datafusion/pull/15130 +[#15123]: https://github.com/apache/datafusion/pull/15123 +[#15027]: https://github.com/apache/datafusion/pull/15027 + +### `FileScanConfig` --> `FileScanConfigBuilder` + +Previously, `FileScanConfig::build()` directly created ExecutionPlans. In +DataFusion 47.0.0 this has been changed to use `FileScanConfigBuilder`. See +[#15352] for details. 
+ +[#15352]: https://github.com/apache/datafusion/pull/15352 + +Pattern in DataFusion `46.0.0`: +```rust +let plan = FileScanConfig::new(url, schema, Arc::new(file_source)) + .with_statistics(stats) + ... + .build() +``` + +Pattern in DataFusion `47.0.0`: +```rust +let config = FileScanConfigBuilder::new(url, schema, Arc::new(file_source)) + .with_statistics(stats) + ... + .build(); +let scan = DataSourceExec::from_data_source(config); +``` + + + ## DataFusion `46.0.0` ### Use `invoke_with_args` instead of `invoke()` and `invoke_batch()` From 82df5acd093d1c630c77f8a6090f1e62ff3793fd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 08:27:28 -0400 Subject: [PATCH 2/6] prettier --- docs/source/library-user-guide/upgrading.md | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index b41dc54e1be7..ef93e967ac12 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -24,13 +24,14 @@ This section calls out some of the major changes in the `47.0.0` release of DataFusion. 
Here are some example upgrade PRs that demonstrate some changed required when upgrading from DataFusion 46.0.0: -* [delta-rs Upgrade to `47.0.0`](https://github.com/delta-io/delta-rs/pull/3378) -* [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563) -* [Sail Upgrade to `47.0.0`](https://github.com/lakehq/sail/pull/434) + +- [delta-rs Upgrade to `47.0.0`](https://github.com/delta-io/delta-rs/pull/3378) +- [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563) +- [Sail Upgrade to `47.0.0`](https://github.com/lakehq/sail/pull/434) ### Upgrades to `arrow` and `parquet` 55.0.0 and `object_store` 0.12.0 -Several APIs are changed in the underlying arrow and parquet libraries to use a +Several APIs are changed in the underlying arrow and parquet libraries to use a `u64` instead of `usize` to better support WASM (See [#7371] and [#6961]) Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been changed to return `static` lifetimes (See [#6619]) @@ -70,6 +71,7 @@ let reader = ParquetObjectReader::new(store, meta); ``` Pattern in DataFusion `47.0.0`: + ```rust let meta: ObjectMeta = ...; let reader = ParquetObjectReader::new(store, location) @@ -100,11 +102,12 @@ details. Previously, `FileScanConfig::build()` directly created ExecutionPlans. In DataFusion 47.0.0 this has been changed to use `FileScanConfigBuilder`. See -[#15352] for details. +[#15352] for details. 
[#15352]: https://github.com/apache/datafusion/pull/15352 Pattern in DataFusion `46.0.0`: + ```rust let plan = FileScanConfig::new(url, schema, Arc::new(file_source)) .with_statistics(stats) @@ -113,6 +116,7 @@ let plan = FileScanConfig::new(url, schema, Arc::new(file_source)) ``` Pattern in DataFusion `47.0.0`: + ```rust let config = FileScanConfigBuilder::new(url, schema, Arc::new(file_source)) .with_statistics(stats) @@ -121,8 +125,6 @@ let config = FileScanConfigBuilder::new(url, schema, Arc::new(file_source)) let scan = DataSourceExec::from_data_source(config); ``` - - ## DataFusion `46.0.0` ### Use `invoke_with_args` instead of `invoke()` and `invoke_batch()` From 93d0d2b355c37ac81d45f3468c7eeaae13b8884c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 13:09:33 -0400 Subject: [PATCH 3/6] Update docs/source/library-user-guide/upgrading.md Co-authored-by: Oleks V --- docs/source/library-user-guide/upgrading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index ef93e967ac12..daff6bc75e92 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -29,7 +29,7 @@ Here are some example upgrade PRs that demonstrate some changed required when up - [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563) - [Sail Upgrade to `47.0.0`](https://github.com/lakehq/sail/pull/434) -### Upgrades to `arrow` and `parquet` 55.0.0 and `object_store` 0.12.0 +### Upgrades to `arrow-rs` and `arrow-parquet` 55.0.0 and `object_store` 0.12.0 Several APIs are changed in the underlying arrow and parquet libraries to use a `u64` instead of `usize` to better support WASM (See [#7371] and [#6961]) From 153e4f06597e32e84f0c8b4e2c4fa36a9d480cf3 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 13:09:41 -0400 Subject: [PATCH 4/6] Update 
docs/source/library-user-guide/upgrading.md Co-authored-by: Oleks V --- docs/source/library-user-guide/upgrading.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index daff6bc75e92..04a569f5ae4e 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -23,7 +23,7 @@ This section calls out some of the major changes in the `47.0.0` release of DataFusion. -Here are some example upgrade PRs that demonstrate some changed required when upgrading from DataFusion 46.0.0: +Here are some example upgrade PRs that demonstrate changes required when upgrading from DataFusion 46.0.0: - [delta-rs Upgrade to `47.0.0`](https://github.com/delta-io/delta-rs/pull/3378) - [DataFusion Comet Upgrade to `47.0.0`](https://github.com/apache/datafusion-comet/pull/1563) From e44456f46b4c17bcdccd5f44b14f392125b146dd Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 13:12:48 -0400 Subject: [PATCH 5/6] Fix examples --- docs/source/library-user-guide/upgrading.md | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index 04a569f5ae4e..ccfe769a391c 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -43,6 +43,7 @@ Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been c This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as ```rust +# /* impl Objectstore { ... 
// The range is now a u64 instead of usize @@ -56,6 +57,7 @@ impl Objectstore { self.inner.list(prefix) } } +# */ ``` The `ParquetObjectReader` has been updated to no longer require the object size @@ -66,16 +68,20 @@ The `ParquetObjectReader` has been updated to no longer require the object size Pattern in DataFusion `46.0.0`: ```rust +# /* let meta: ObjectMeta = ...; let reader = ParquetObjectReader::new(store, meta); +# */ ``` Pattern in DataFusion `47.0.0`: ```rust +# /* let meta: ObjectMeta = ...; let reader = ParquetObjectReader::new(store, location) .with_file_size(meta.size); +# */ ``` ### `DisplayFormatType::TreeRender` From a7a11085697c124df1991af4a14eb96fdfdd1229 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Thu, 17 Apr 2025 16:18:08 -0400 Subject: [PATCH 6/6] Try and fix tests again --- docs/source/library-user-guide/upgrading.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index ccfe769a391c..db5078d603f2 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -43,7 +43,7 @@ Additionally `ObjectStore::list` and `ObjectStore::list_with_offset` have been c This requires converting from `usize` to `u64` occasionally as well as changes to `ObjectStore` implementations such as ```rust -# /* +# /* comment to avoid running impl Objectstore { ... 
// The range is now a u64 instead of usize @@ -68,7 +68,7 @@ The `ParquetObjectReader` has been updated to no longer require the object size Pattern in DataFusion `46.0.0`: ```rust -# /* +# /* comment to avoid running let meta: ObjectMeta = ...; let reader = ParquetObjectReader::new(store, meta); # */ @@ -77,7 +77,7 @@ let reader = ParquetObjectReader::new(store, meta); Pattern in DataFusion `47.0.0`: ```rust -# /* +# /* comment to avoid running let meta: ObjectMeta = ...; let reader = ParquetObjectReader::new(store, location) .with_file_size(meta.size); @@ -115,20 +115,24 @@ DataFusion 47.0.0 this has been changed to use `FileScanConfigBuilder`. See Pattern in DataFusion `46.0.0`: ```rust +# /* comment to avoid running let plan = FileScanConfig::new(url, schema, Arc::new(file_source)) .with_statistics(stats) ... .build() +# */ ``` Pattern in DataFusion `47.0.0`: ```rust +# /* comment to avoid running let config = FileScanConfigBuilder::new(url, schema, Arc::new(file_source)) .with_statistics(stats) ... .build(); let scan = DataSourceExec::from_data_source(config); +# */ ``` ## DataFusion `46.0.0` @@ -151,7 +155,7 @@ below. See [PR 14876] for an example. Given existing code like this: ```rust -# /* +# /* comment to avoid running impl ScalarUDFImpl for SparkConcat { ... fn invoke_batch(&self, args: &[ColumnarValue], number_rows: usize) -> Result { @@ -171,7 +175,7 @@ impl ScalarUDFImpl for SparkConcat { To ```rust -# /* comment out so they don't run +# /* comment to avoid running impl ScalarUDFImpl for SparkConcat { ... fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result {