From 35795b1910897990d7219bfebeb11764f659eae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?L=C3=ADa=20Adriana?= Date: Thu, 2 Oct 2025 09:56:16 +0200 Subject: [PATCH 1/2] Revert "Revert arrow upgrade and related changes (#50)" This reverts commit 5506e698c41810525331e57766f73e865d86e2a9. --- Cargo.lock | 166 ++-- Cargo.toml | 16 +- datafusion-examples/Cargo.toml | 2 +- datafusion-testing | 2 +- datafusion/common/Cargo.toml | 2 +- datafusion/common/src/config.rs | 16 +- .../common/src/file_options/parquet_writer.rs | 25 +- datafusion/common/src/scalar/mod.rs | 19 +- datafusion/common/src/types/native.rs | 5 +- .../src/datasource/file_format/parquet.rs | 27 +- datafusion/core/tests/fuzz_cases/pruning.rs | 11 +- datafusion/core/tests/parquet/mod.rs | 4 +- .../core/tests/parquet/row_group_pruning.rs | 14 +- .../src/avro_to_arrow/schema.rs | 2 + .../datasource-parquet/src/file_format.rs | 2 +- datafusion/expr/src/utils.rs | 2 + datafusion/physical-plan/src/sorts/cursor.rs | 214 +---- .../proto/datafusion_common.proto | 40 +- datafusion/proto-common/src/from_proto/mod.rs | 31 +- .../proto-common/src/generated/pbjson.rs | 728 +++++++++++++++--- .../proto-common/src/generated/prost.rs | 66 +- datafusion/proto-common/src/to_proto/mod.rs | 18 +- .../src/generated/datafusion_proto_common.rs | 66 +- .../proto/src/logical_plan/file_formats.rs | 14 - datafusion/sql/src/unparser/expr.rs | 6 + datafusion/sqllogictest/test_files/array.slt | 14 +- datafusion/sqllogictest/test_files/copy.slt | 1 - .../test_files/information_schema.slt | 6 +- .../test_files/listing_table_statistics.slt | 2 +- .../test_files/parquet_statistics.slt | 16 +- .../test_files/repartition_scan.slt | 8 +- .../src/logical_plan/consumer/utils.rs | 2 + docs/source/library-user-guide/upgrading.md | 6 + docs/source/user-guide/configs.md | 3 +- parquet-testing | 2 +- testing | 2 +- typos.toml | 1 - 37 files changed, 948 insertions(+), 613 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e58f72ee7b2f..5186c6d6d598 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -240,9 +240,9 @@ checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" [[package]] name = "arrow" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3f15b4c6b148206ff3a2b35002e08929c2462467b62b9c02036d9c34f9ef994" +checksum = "fd798aea3553913a5986813e9c6ad31a2d2b04e931fe8ea4a37155eb541cebb5" dependencies = [ "arrow-arith", "arrow-array", @@ -264,9 +264,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30feb679425110209ae35c3fbf82404a39a4c0436bb3ec36164d8bffed2a4ce4" +checksum = "508dafb53e5804a238cab7fd97a59ddcbfab20cc4d9814b1ab5465b9fa147f2e" dependencies = [ "arrow-array", "arrow-buffer", @@ -278,9 +278,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "70732f04d285d49054a48b72c54f791bb3424abae92d27aafdf776c98af161c8" +checksum = "e2730bc045d62bb2e53ef8395b7d4242f5c8102f41ceac15e8395b9ac3d08461" dependencies = [ "ahash 0.8.12", "arrow-buffer", @@ -295,9 +295,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "169b1d5d6cb390dd92ce582b06b23815c7953e9dfaaea75556e89d890d19993d" +checksum = 
"54295b93beb702ee9a6f6fbced08ad7f4d76ec1c297952d4b83cf68755421d1d" dependencies = [ "bytes", "half", @@ -306,9 +306,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e4f12eccc3e1c05a766cafb31f6a60a46c2f8efec9b74c6e0648766d30686af8" +checksum = "67e8bcb7dc971d779a7280593a1bf0c2743533b8028909073e804552e85e75b5" dependencies = [ "arrow-array", "arrow-buffer", @@ -327,9 +327,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "012c9fef3f4a11573b2c74aec53712ff9fdae4a95f4ce452d1bbf088ee00f06b" +checksum = "673fd2b5fb57a1754fdbfac425efd7cf54c947ac9950c1cce86b14e248f1c458" dependencies = [ "arrow-array", "arrow-cast", @@ -342,9 +342,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8de1ce212d803199684b658fc4ba55fb2d7e87b213de5af415308d2fee3619c2" +checksum = "97c22fe3da840039c69e9f61f81e78092ea36d57037b4900151f063615a2f6b4" dependencies = [ "arrow-buffer", "arrow-schema", @@ -354,9 +354,9 @@ dependencies = [ [[package]] name = "arrow-flight" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5cb3e1d2b441e6d1d5988e3f7c4523c9466b18ef77d7c525d92d36d4cad49fbe" +checksum = "6808d235786b721e49e228c44dd94242f2e8b46b7e95b233b0733c46e758bfee" dependencies = [ "arrow-arith", "arrow-array", @@ -381,9 +381,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9ea5967e8b2af39aff5d9de2197df16e305f47f404781d3230b2dc672da5d92" +checksum = "778de14c5a69aedb27359e3dd06dd5f9c481d5f6ee9fbae912dba332fd64636b" dependencies = [ "arrow-array", "arrow-buffer", @@ -396,9 +396,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5709d974c4ea5be96d900c01576c7c0b99705f4a3eec343648cb1ca863988a9c" +checksum = "3860db334fe7b19fcf81f6b56f8d9d95053f3839ffe443d56b5436f7a29a1794" dependencies = [ "arrow-array", "arrow-buffer", @@ -418,9 +418,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6506e3a059e3be23023f587f79c82ef0bcf6d293587e3272d20f2d30b969b5a7" +checksum = "425fa0b42a39d3ff55160832e7c25553e7f012c3f187def3d70313e7a29ba5d9" dependencies = [ "arrow-array", "arrow-buffer", @@ -431,9 +431,9 @@ dependencies = [ [[package]] name = "arrow-pyarrow" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e55ecf16b9b61d433f6e63c72fc6afcf2597d7db96583de88ebb887d1822268" +checksum = "d944d8ae9b77230124e6570865b570416c33a5809f32c4136c679bbe774e45c9" dependencies = [ "arrow-array", "arrow-data", @@ -443,9 +443,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52bf7393166beaf79b4bed9bfdf19e97472af32ce5b6b48169d321518a08cae2" +checksum = "df9c9423c9e71abd1b08a7f788fcd203ba2698ac8e72a1f236f1faa1a06a7414" dependencies = [ "arrow-array", "arrow-buffer", @@ -456,9 +456,9 @@ dependencies = [ 
[[package]] name = "arrow-schema" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af7686986a3bf2254c9fb130c623cdcb2f8e1f15763e7c71c310f0834da3d292" +checksum = "85fa1babc4a45fdc64a92175ef51ff00eba5ebbc0007962fecf8022ac1c6ce28" dependencies = [ "bitflags 2.9.1", "serde", @@ -467,9 +467,9 @@ dependencies = [ [[package]] name = "arrow-select" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd2b45757d6a2373faa3352d02ff5b54b098f5e21dccebc45a21806bc34501e5" +checksum = "d8854d15f1cf5005b4b358abeb60adea17091ff5bdd094dca5d3f73787d81170" dependencies = [ "ahash 0.8.12", "arrow-array", @@ -481,9 +481,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0377d532850babb4d927a06294314b316e23311503ed580ec6ce6a0158f49d40" +checksum = "2c477e8b89e1213d5927a2a84a72c384a9bf4dd0dbf15f9fd66d821aafd9e95e" dependencies = [ "arrow-array", "arrow-buffer", @@ -561,28 +561,6 @@ dependencies = [ "syn 2.0.106", ] -[[package]] -name = "async-stream" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" -dependencies = [ - "async-stream-impl", - "futures-core", - "pin-project-lite", -] - -[[package]] -name = "async-stream-impl" -version = "0.3.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.106", -] - [[package]] name = "async-trait" version = "0.1.89" @@ -844,7 +822,7 @@ dependencies = [ "rustls-pki-types", "tokio", "tokio-rustls", - "tower 0.5.2", + "tower", "tracing", ] @@ -965,11 +943,10 @@ dependencies = [ [[package]] name = "axum" -version = "0.7.9" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edca88bc138befd0323b20752846e6587272d3b03b0343c8ea28a6f819e6e71f" +checksum = "021e862c184ae977658b36c4500f7feac3221ca5da43e3f25bd04ab6c79a29b5" dependencies = [ - "async-trait", "axum-core", "bytes", "futures-util", @@ -985,20 +962,19 @@ dependencies = [ "rustversion", "serde", "sync_wrapper", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] [[package]] name = "axum-core" -version = "0.4.5" +version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +checksum = "68464cd0412f486726fb3373129ef5d2993f90c34bc2bc1c1e9943b2f4fc7ca6" dependencies = [ - "async-trait", "bytes", - "futures-util", + "futures-core", "http 1.3.1", "http-body 1.0.1", "http-body-util", @@ -4190,9 +4166,9 @@ dependencies = [ [[package]] name = "matchit" -version = "0.7.3" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e7465ac9959cc2b1404e8e2367b43684a6d13790fe23056cc8c6c5a6b7bcb94" +checksum = "47e1ffaa40ddd1f3ed91f717a33c8c0ee23fff369e3aa8772b9605cc1d22f4c3" [[package]] name = "md-5" @@ -4553,9 +4529,9 @@ dependencies = [ [[package]] name = "parquet" -version = "55.2.0" +version = "56.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b17da4150748086bd43352bc77372efa9b6e3dbd06a04831d2a98c041c225cfa" +checksum = "c7288a07ed5d25939a90f9cb1ca5afa6855faa08ec7700613511ae64bdb0620c" dependencies 
= [ "ahash 0.8.12", "arrow-array", @@ -5052,11 +5028,10 @@ dependencies = [ [[package]] name = "pyo3" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e5203598f366b11a02b13aa20cab591229ff0a89fd121a308a5df751d5fc9219" +checksum = "8970a78afe0628a3e3430376fc5fd76b6b45c4d43360ffd6cdd40bdde72b682a" dependencies = [ - "cfg-if", "indoc", "libc", "memoffset", @@ -5070,9 +5045,9 @@ dependencies = [ [[package]] name = "pyo3-build-config" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99636d423fa2ca130fa5acde3059308006d46f98caac629418e53f7ebb1e9999" +checksum = "458eb0c55e7ece017adeba38f2248ff3ac615e53660d7c71a238d7d2a01c7598" dependencies = [ "once_cell", "target-lexicon", @@ -5080,9 +5055,9 @@ dependencies = [ [[package]] name = "pyo3-ffi" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78f9cf92ba9c409279bc3305b5409d90db2d2c22392d443a87df3a1adad59e33" +checksum = "7114fe5457c61b276ab77c5055f206295b812608083644a5c5b2640c3102565c" dependencies = [ "libc", "pyo3-build-config", @@ -5090,9 +5065,9 @@ dependencies = [ [[package]] name = "pyo3-macros" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b999cb1a6ce21f9a6b147dcf1be9ffedf02e0043aec74dc390f3007047cecd9" +checksum = "a8725c0a622b374d6cb051d11a0983786448f7785336139c3c94f5aa6bef7e50" dependencies = [ "proc-macro2", "pyo3-macros-backend", @@ -5102,9 +5077,9 @@ dependencies = [ [[package]] name = "pyo3-macros-backend" -version = "0.24.2" +version = "0.25.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "822ece1c7e1012745607d5cf0bcb2874769f0f7cb34c4cde03b9358eb9ef911a" +checksum = "4109984c22491085343c05b0dbc54ddc405c3cf7b4374fc533f5c3313a572ccc" dependencies = [ "heck 0.5.0", "proc-macro2", @@ -5474,7 +5449,7 @@ dependencies = [ "tokio", "tokio-rustls", "tokio-util", - "tower 0.5.2", + "tower", "tower-http", "tower-service", "url", @@ -6679,11 +6654,10 @@ dependencies = [ [[package]] name = "tonic" -version = "0.12.3" +version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" +checksum = "7e581ba15a835f4d9ea06c55ab1bd4dce26fc53752c69a04aac00703bfb49ba9" dependencies = [ - "async-stream", "async-trait", "axum", "base64 0.22.1", @@ -6701,27 +6675,7 @@ dependencies = [ "socket2 0.5.10", "tokio", "tokio-stream", - "tower 0.4.13", - "tower-layer", - "tower-service", - "tracing", -] - -[[package]] -name = "tower" -version = "0.4.13" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c" -dependencies = [ - "futures-core", - "futures-util", - "indexmap 1.9.3", - "pin-project", - "pin-project-lite", - "rand 0.8.5", - "slab", - "tokio", - "tokio-util", + "tower", "tower-layer", "tower-service", "tracing", @@ -6735,11 +6689,15 @@ checksum = "d039ad9159c98b70ecfd540b2573b97f7f52c3e8d9f8ad57a24b916a536975f9" dependencies = [ "futures-core", "futures-util", + "indexmap 2.11.0", "pin-project-lite", + "slab", "sync_wrapper", "tokio", + "tokio-util", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -6755,7 +6713,7 @@ dependencies = [ "http-body 1.0.1", "iri-string", "pin-project-lite", - "tower 0.5.2", + "tower", "tower-layer", "tower-service", ] diff 
--git a/Cargo.toml b/Cargo.toml index 6f7c61268e0e..53c35ed35f0d 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -90,20 +90,20 @@ ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } apache-avro = { version = "0.20", default-features = false } -arrow = { version = "55.2.0", features = [ +arrow = { version = "56.0.0", features = [ "prettyprint", "chrono-tz", ] } -arrow-buffer = { version = "55.2.0", default-features = false } -arrow-flight = { version = "55.2.0", features = [ +arrow-buffer = { version = "56.0.0", default-features = false } +arrow-flight = { version = "56.0.0", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "55.2.0", default-features = false, features = [ +arrow-ipc = { version = "56.0.0", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "55.2.0", default-features = false } -arrow-schema = { version = "55.2.0", default-features = false } -async-trait = "0.1.88" +arrow-ord = { version = "56.0.0", default-features = false } +arrow-schema = { version = "56.0.0", default-features = false } +async-trait = "0.1.89" bigdecimal = "0.4.8" bytes = "1.10" chrono = { version = "0.4.41", default-features = false } @@ -157,7 +157,7 @@ itertools = "0.14" log = "^0.4" object_store = { version = "0.12.3", default-features = false } parking_lot = "0.12" -parquet = { version = "55.2.0", default-features = false, features = [ +parquet = { version = "56.0.0", default-features = false, features = [ "arrow", "async", "object_store", diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 204fc8794fbb..68bb5376a1ac 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -81,7 +81,7 @@ serde_json = { workspace = true } tempfile = { workspace = true } test-utils = { path = "../test-utils" } tokio = { workspace = true, features = ["rt-multi-thread", "parking_lot"] } -tonic = "0.12.1" +tonic = "0.13.1" tracing = { version = "0.1" } tracing-subscriber = { version = "0.3" } url = { workspace = true } diff --git a/datafusion-testing b/datafusion-testing index 905df5f65cc9..f72ac4075ada 160000 --- a/datafusion-testing +++ b/datafusion-testing @@ -1 +1 @@ -Subproject commit 905df5f65cc9d0851719c21f5a4dd5cd77621f19 +Subproject commit f72ac4075ada5ea9810551bc0c3e3161c61204a2 diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 6350c29b46de..aea5e51befb0 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -71,7 +71,7 @@ log = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.24.2", optional = true } +pyo3 = { version = "0.25", optional = true } recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 876d419b971f..cdd8e72a06cc 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -602,13 +602,6 @@ config_namespace! { /// default parquet writer setting pub statistics_enabled: Option<String>, transform = str::to_lowercase, default = Some("page".into()) - /// (writing) Sets max statistics size for any column.
If NULL, uses - /// default parquet writer setting - /// max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[deprecated(since = "45.0.0", note = "Setting does not do anything")] - pub max_statistics_size: Option<usize>, default = Some(4096) - /// (writing) Target maximum number of rows in each row group (defaults to 1M /// rows). Writing larger row groups requires more memory to write, but /// can get better compression and be faster to read. @@ -622,7 +615,7 @@ config_namespace! { /// (writing) Sets statistics truncate length. If NULL, uses /// default parquet writer setting - pub statistics_truncate_length: Option<usize>, default = None + pub statistics_truncate_length: Option<usize>, default = Some(64) /// (writing) Sets best effort maximum number of rows in data page pub data_page_row_count_limit: usize, default = 20_000 @@ -2141,13 +2134,6 @@ config_namespace_with_hashmap! { /// Sets bloom filter number of distinct values. If NULL, uses /// default parquet options pub bloom_filter_ndv: Option<u64>, default = None - - /// Sets max statistics size for the column path. If NULL, uses - /// default parquet options - /// max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[deprecated(since = "45.0.0", note = "Setting does not do anything")] - pub max_statistics_size: Option<usize>, default = None } } diff --git a/datafusion/common/src/file_options/parquet_writer.rs b/datafusion/common/src/file_options/parquet_writer.rs index a59cd3cb7554..185826aef47d 100644 --- a/datafusion/common/src/file_options/parquet_writer.rs +++ b/datafusion/common/src/file_options/parquet_writer.rs @@ -35,7 +35,7 @@ use parquet::{ metadata::KeyValue, properties::{ EnabledStatistics, WriterProperties, WriterPropertiesBuilder, WriterVersion, - DEFAULT_MAX_STATISTICS_SIZE, DEFAULT_STATISTICS_ENABLED, + DEFAULT_STATISTICS_ENABLED, }, }, schema::types::ColumnPath, @@ -160,16 +160,6 @@ impl TryFrom<&TableParquetOptions> for WriterPropertiesBuilder { builder = builder.set_column_bloom_filter_ndv(path.clone(), bloom_filter_ndv); } - - // max_statistics_size is deprecated, currently it is not being used - // TODO: remove once deprecated - #[allow(deprecated)] - if let Some(max_statistics_size) = options.max_statistics_size { - builder = { - #[allow(deprecated)] - builder.set_column_max_statistics_size(path, max_statistics_size) - } - } } Ok(builder) @@ -218,7 +208,6 @@ impl ParquetOptions { dictionary_enabled, dictionary_page_size_limit, statistics_enabled, - max_statistics_size, max_row_group_size, created_by, column_index_truncate_length, @@ -264,13 +253,6 @@ impl ParquetOptions { .set_data_page_row_count_limit(*data_page_row_count_limit) .set_bloom_filter_enabled(*bloom_filter_on_write); - builder = { - #[allow(deprecated)] - builder.set_max_statistics_size( - max_statistics_size.unwrap_or(DEFAULT_MAX_STATISTICS_SIZE), - ) - }; - if let Some(bloom_filter_fpp) = bloom_filter_fpp { builder = builder.set_bloom_filter_fpp(*bloom_filter_fpp); }; @@ -463,12 +445,10 @@ mod tests { fn column_options_with_non_defaults( src_col_defaults: &ParquetOptions, ) -> ParquetColumnOptions { - #[allow(deprecated)] // max_statistics_size ParquetColumnOptions { compression: Some("zstd(22)".into()), dictionary_enabled: src_col_defaults.dictionary_enabled.map(|v| !v), statistics_enabled: Some("none".into()), - max_statistics_size: Some(72), encoding: Some("RLE".into()), bloom_filter_enabled: Some(true), bloom_filter_fpp: Some(0.72), @@ -493,7 +473,6 @@ mod tests {
dictionary_enabled: Some(!defaults.dictionary_enabled.unwrap_or(false)), dictionary_page_size_limit: 42, statistics_enabled: Some("chunk".into()), - max_statistics_size: Some(42), max_row_group_size: 42, created_by: "wordy".into(), column_index_truncate_length: Some(42), @@ -551,7 +530,6 @@ mod tests { ), bloom_filter_fpp: bloom_filter_default_props.map(|p| p.fpp), bloom_filter_ndv: bloom_filter_default_props.map(|p| p.ndv), - max_statistics_size: Some(props.max_statistics_size(&col)), } } @@ -608,7 +586,6 @@ mod tests { compression: default_col_props.compression, dictionary_enabled: default_col_props.dictionary_enabled, statistics_enabled: default_col_props.statistics_enabled, - max_statistics_size: default_col_props.max_statistics_size, bloom_filter_on_write: default_col_props .bloom_filter_enabled .unwrap_or_default(), diff --git a/datafusion/common/src/scalar/mod.rs b/datafusion/common/src/scalar/mod.rs index 452d6c13b3e0..4d88f5a66732 100644 --- a/datafusion/common/src/scalar/mod.rs +++ b/datafusion/common/src/scalar/mod.rs @@ -904,11 +904,10 @@ pub fn dict_from_values<K: ArrowDictionaryKeyType>( .map(|index| { if values_array.is_valid(index) { let native_index = K::Native::from_usize(index).ok_or_else(|| { - DataFusionError::Internal(format!( - "Can not create index of type {} from value {}", - K::DATA_TYPE, - index - )) + _internal_datafusion_err!( + "Can not create index of type {} from value {index}", + K::DATA_TYPE + ) })?; Ok(Some(native_index)) } else { @@ -2203,6 +2202,16 @@ impl ScalarValue { } let array: ArrayRef = match &data_type { + DataType::Decimal32(_precision, _scale) => { + return _not_impl_err!( + "Decimal32 not supported in ScalarValue::iter_to_array" + ); + } + DataType::Decimal64(_precision, _scale) => { + return _not_impl_err!( + "Decimal64 not supported in ScalarValue::iter_to_array" + ); + } DataType::Decimal128(precision, scale) => { let decimal_array = ScalarValue::iter_to_decimal_array(scalars, *precision, *scale)?; diff --git a/datafusion/common/src/types/native.rs b/datafusion/common/src/types/native.rs index 39c79b4b9974..76629e555b8c 100644 --- a/datafusion/common/src/types/native.rs +++ b/datafusion/common/src/types/native.rs @@ -407,7 +407,10 @@ impl From<DataType> for NativeType { DataType::Union(union_fields, _) => { Union(LogicalUnionFields::from(&union_fields)) } - DataType::Decimal128(p, s) | DataType::Decimal256(p, s) => Decimal(p, s), + DataType::Decimal32(p, s) + | DataType::Decimal64(p, s) + | DataType::Decimal128(p, s) + | DataType::Decimal256(p, s) => Decimal(p, s), DataType::Map(field, _) => Map(Arc::new(field.as_ref().into())), DataType::Dictionary(_, data_type) => data_type.as_ref().clone().into(), DataType::RunEndEncoded(_, field) => field.data_type().clone().into(), diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index db704bee8801..088c4408fff5 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -523,11 +523,23 @@ mod tests { let dic_array = DictionaryArray::<Int32Type>::try_new(keys, Arc::new(values))?; let c_dic: ArrayRef = Arc::new(dic_array); - let batch1 = RecordBatch::try_from_iter(vec![("c_dic", c_dic)])?; + // Data for column string_truncation: ["a".repeat(128), null, "b".repeat(128), null] + let string_truncation: ArrayRef = Arc::new(StringArray::from(vec![ + Some("a".repeat(128)), + None, + Some("b".repeat(128)), + None, + ])); + + let batch1 = RecordBatch::try_from_iter(vec![ + ("c_dic", c_dic), + ("string_truncation",
string_truncation), + ])?; // Use store_parquet to write each batch to its own file // . batch1 written into first file and includes: // - column c_dic that has 4 rows with no null. Stats min and max of dictionary column is available. + // - column string_truncation that has 4 rows with 2 nulls. Stats min and max of string column is available but not exact. let store = Arc::new(RequestCountingObjectStore::new(Arc::new( LocalFileSystem::new(), ))); @@ -563,6 +575,19 @@ mod tests { Precision::Exact(Utf8(Some("a".into()))) ); + // column string_truncation + let string_truncation_stats = &stats.column_statistics[1]; + + assert_eq!(string_truncation_stats.null_count, Precision::Exact(2)); + assert_eq!( + string_truncation_stats.max_value, + Precision::Inexact(ScalarValue::Utf8View(Some("b".repeat(63) + "c"))) + ); + assert_eq!( + string_truncation_stats.min_value, + Precision::Inexact(ScalarValue::Utf8View(Some("a".repeat(64)))) + ); + Ok(()) } diff --git a/datafusion/core/tests/fuzz_cases/pruning.rs b/datafusion/core/tests/fuzz_cases/pruning.rs index 4c0073b94be7..f8bd4dbc1a76 100644 --- a/datafusion/core/tests/fuzz_cases/pruning.rs +++ b/datafusion/core/tests/fuzz_cases/pruning.rs @@ -319,14 +319,9 @@ async fn write_parquet_file( row_groups: Vec>, ) -> Bytes { let mut buf = BytesMut::new().writer(); - let mut props = WriterProperties::builder(); - if let Some(truncation_length) = truncation_length { - props = { - #[allow(deprecated)] - props.set_max_statistics_size(truncation_length) - } - } - props = props.set_statistics_enabled(EnabledStatistics::Chunk); // row group level + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Chunk) // row group level + .set_statistics_truncate_length(truncation_length); let props = props.build(); { let mut writer = diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index d0ddd9a0b0f1..c44d14abd381 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -110,11 +110,11 @@ struct ContextWithParquet { /// The output of running one of the test cases struct TestOutput { - /// The input string + /// The input query SQL sql: String, /// Execution metrics for the Parquet Scan parquet_metrics: MetricsSet, - /// number of rows in results + /// number of actual rows in results result_rows: usize, /// the contents of the input, as a string pretty_input: String, diff --git a/datafusion/core/tests/parquet/row_group_pruning.rs b/datafusion/core/tests/parquet/row_group_pruning.rs index 8613cd481be1..44409166d3ce 100644 --- a/datafusion/core/tests/parquet/row_group_pruning.rs +++ b/datafusion/core/tests/parquet/row_group_pruning.rs @@ -34,7 +34,7 @@ struct RowGroupPruningTest { expected_files_pruned_by_statistics: Option, expected_row_group_matched_by_bloom_filter: Option, expected_row_group_pruned_by_bloom_filter: Option, - expected_results: usize, + expected_rows: usize, } impl RowGroupPruningTest { // Start building the test configuration @@ -48,7 +48,7 @@ impl RowGroupPruningTest { expected_files_pruned_by_statistics: None, expected_row_group_matched_by_bloom_filter: None, expected_row_group_pruned_by_bloom_filter: None, - expected_results: 0, + expected_rows: 0, } } @@ -99,9 +99,9 @@ impl RowGroupPruningTest { self } - // Set the expected rows for the test + /// Set the number of expected rows from the output of this test fn with_expected_rows(mut self, rows: usize) -> Self { - self.expected_results = rows; + self.expected_rows = rows; self } @@ -145,8 
+145,10 @@ impl RowGroupPruningTest { ); assert_eq!( output.result_rows, - self.expected_results, - "mismatched expected rows: {}", + self.expected_rows, + "Expected {} rows, got {}: {}", + self.expected_rows, + output.result_rows, output.description(), ); } } diff --git a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs index 39d93d2f193e..3fce0d4826a2 100644 --- a/datafusion/datasource-avro/src/avro_to_arrow/schema.rs +++ b/datafusion/datasource-avro/src/avro_to_arrow/schema.rs @@ -238,6 +238,8 @@ fn default_field_name(dt: &DataType) -> &str { | DataType::LargeListView(_) => { unimplemented!("View support not implemented") } + DataType::Decimal32(_, _) => "decimal", + DataType::Decimal64(_, _) => "decimal", DataType::Decimal128(_, _) => "decimal", DataType::Decimal256(_, _) => "decimal", } } diff --git a/datafusion/datasource-parquet/src/file_format.rs b/datafusion/datasource-parquet/src/file_format.rs index 1ec2015ce549..1fcc1721017c 100644 --- a/datafusion/datasource-parquet/src/file_format.rs +++ b/datafusion/datasource-parquet/src/file_format.rs @@ -19,11 +19,11 @@ use std::any::Any; use std::cell::RefCell; -use std::fmt; use std::fmt::Debug; use std::ops::Range; use std::rc::Rc; use std::sync::Arc; +use std::{fmt, vec}; use arrow::array::RecordBatch; use arrow::datatypes::{Fields, Schema, SchemaRef, TimeUnit}; diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 684eba59536b..2e364d0d2b80 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -814,6 +814,8 @@ pub fn can_hash(data_type: &DataType) -> bool { DataType::Float16 => true, DataType::Float32 => true, DataType::Float64 => true, + DataType::Decimal32(_, _) => true, + DataType::Decimal64(_, _) => true, DataType::Decimal128(_, _) => true, DataType::Decimal256(_, _) => true, DataType::Timestamp(_, _) => true, diff --git a/datafusion/physical-plan/src/sorts/cursor.rs b/datafusion/physical-plan/src/sorts/cursor.rs index 8ab603e04961..54dc2414e4f0 100644 --- a/datafusion/physical-plan/src/sorts/cursor.rs +++ b/datafusion/physical-plan/src/sorts/cursor.rs @@ -289,120 +289,6 @@ impl CursorArray for StringViewArray { } } -/// Todo use arrow-rs side api after: and released -/// Builds a 128-bit composite key for an inline value: -/// -/// - High 96 bits: the inline data in big-endian byte order (for correct lexicographical sorting). -/// - Low 32 bits: the length in big-endian byte order, acting as a tiebreaker so shorter strings -/// (or those with fewer meaningful bytes) always numerically sort before longer ones. -/// -/// This function extracts the length and the 12-byte inline string data from the raw -/// little-endian `u128` representation, converts them to big-endian ordering, and packs them -/// into a single `u128` value suitable for fast, branchless comparisons. -/// -/// # Why include length? -/// -/// A pure 96-bit content comparison can’t distinguish between two values whose inline bytes -/// compare equal—either because one is a true prefix of the other or because zero-padding -/// hides extra bytes. By tucking the 32-bit length into the lower bits, a single `u128` compare -/// handles both content and length in one go.
-/// -/// Example: comparing "bar" (3 bytes) vs "bar\0" (4 bytes) -/// -/// | String | Bytes 0–4 (length LE) | Bytes 4–16 (data + padding) | -/// |------------|-----------------------|---------------------------------| -/// | `"bar"` | `03 00 00 00` | `62 61 72` + 9 × `00` | -/// | `"bar\0"`| `04 00 00 00` | `62 61 72 00` + 8 × `00` | -/// -/// Both inline parts become `62 61 72 00…00`, so they tie on content. The length field -/// then differentiates: -/// -/// ```text -/// key("bar") = 0x0000000000000000000062617200000003 -/// key("bar\0") = 0x0000000000000000000062617200000004 -/// ⇒ key("bar") < key("bar\0") -/// ``` -/// # Inlining and Endianness -/// -/// - We start by calling `.to_le_bytes()` on the `raw` `u128`, because Rust’s native in‑memory -/// representation is little‑endian on x86/ARM. -/// - We extract the low 32 bits numerically (`raw as u32`)—this step is endianness‑free. -/// - We copy the 12 bytes of inline data (original order) into `buf[0..12]`. -/// - We serialize `length` as big‑endian into `buf[12..16]`. -/// - Finally, `u128::from_be_bytes(buf)` treats `buf[0]` as the most significant byte -/// and `buf[15]` as the least significant, producing a `u128` whose integer value -/// directly encodes “inline data then length” in big‑endian form. -/// -/// This ensures that a simple `u128` comparison is equivalent to the desired -/// lexicographical comparison of the inline bytes followed by length. -#[inline(always)] -pub fn inline_key_fast(raw: u128) -> u128 { - // 1. Decompose `raw` into little‑endian bytes: - // - raw_bytes[0..4] = length in LE - // - raw_bytes[4..16] = inline string data - let raw_bytes = raw.to_le_bytes(); - - // 2. Numerically truncate to get the low 32‑bit length (endianness‑free). - let length = raw as u32; - - // 3. Build a 16‑byte buffer in big‑endian order: - // - buf[0..12] = inline string bytes (in original order) - // - buf[12..16] = length.to_be_bytes() (BE) - let mut buf = [0u8; 16]; - buf[0..12].copy_from_slice(&raw_bytes[4..16]); // inline data - - // Why convert length to big-endian for comparison? - // - // Rust (on most platforms) stores integers in little-endian format, - // meaning the least significant byte is at the lowest memory address. - // For example, an u32 value like 0x22345677 is stored in memory as: - // - // [0x77, 0x56, 0x34, 0x22] // little-endian layout - // ^ ^ ^ ^ - // LSB ↑↑↑ MSB - // - // This layout is efficient for arithmetic but *not* suitable for - // lexicographic (dictionary-style) comparison of byte arrays. - // - // To compare values by byte order—e.g., for sorted keys or binary trees— - // we must convert them to **big-endian**, where: - // - // - The most significant byte (MSB) comes first (index 0) - // - The least significant byte (LSB) comes last (index N-1) - // - // In big-endian, the same u32 = 0x22345677 would be represented as: - // - // [0x22, 0x34, 0x56, 0x77] - // - // This ordering aligns with natural string/byte sorting, so calling - // `.to_be_bytes()` allows us to construct - // keys where standard numeric comparison (e.g., `<`, `>`) behaves - // like lexicographic byte comparison. - buf[12..16].copy_from_slice(&length.to_be_bytes()); // length in BE - - // 4. Deserialize the buffer as a big‑endian u128: - // buf[0] is MSB, buf[15] is LSB. 
- // Details: - // Note on endianness and layout: - // - // Although `buf[0]` is stored at the lowest memory address, - // calling `u128::from_be_bytes(buf)` interprets it as the **most significant byte (MSB)**, - // and `buf[15]` as the **least significant byte (LSB)**. - // - // This is the core principle of **big-endian decoding**: - // - Byte at index 0 maps to bits 127..120 (highest) - // - Byte at index 1 maps to bits 119..112 - // - ... - // - Byte at index 15 maps to bits 7..0 (lowest) - // - // So even though memory layout goes from low to high (left to right), - // big-endian treats the **first byte** as highest in value. - // - // This guarantees that comparing two `u128` keys is equivalent to lexicographically - // comparing the original inline bytes, followed by length. - u128::from_be_bytes(buf) -} - impl CursorValues for StringViewArray { fn len(&self) -> usize { self.views().len() @@ -460,7 +346,8 @@ impl CursorValues for StringViewArray { if l.data_buffers().is_empty() && r.data_buffers().is_empty() { let l_view = unsafe { l.views().get_unchecked(l_idx) }; let r_view = unsafe { r.views().get_unchecked(r_idx) }; - return inline_key_fast(*l_view).cmp(&inline_key_fast(*r_view)); + return StringViewArray::inline_key_fast(*l_view) + .cmp(&StringViewArray::inline_key_fast(*r_view)); } unsafe { GenericByteViewArray::compare_unchecked(l, l_idx, r, r_idx) } @@ -555,7 +442,6 @@ impl CursorValues for ArrayValues { #[cfg(test)] mod tests { - use arrow::array::GenericBinaryArray; use datafusion_execution::memory_pool::{ GreedyMemoryPool, MemoryConsumer, MemoryPool, }; @@ -720,100 +606,4 @@ mod tests { b.advance(); assert_eq!(a.cmp(&b), Ordering::Less); } - - /// Integration tests for `inline_key_fast` covering: - /// - /// 1. Monotonic ordering across increasing lengths and lexical variations. - /// 2. Cross-check against `GenericBinaryArray` comparison to ensure semantic equivalence. - /// - /// This also includes a specific test for the “bar” vs. “bar\0” case, demonstrating why - /// the length field is required even when all inline bytes fit in 12 bytes. - /// - /// The test includes strings that verify correct byte order (prevent reversal bugs), - /// and length-based tie-breaking in the composite key. - /// - /// The test confirms that `inline_key_fast` produces keys which sort consistently - /// with the expected lexicographical order of the raw byte arrays. - #[test] - fn test_inline_key_fast_various_lengths_and_lexical() { - /// Helper to create a raw u128 value representing an inline ByteView: - /// - `length`: number of meaningful bytes (must be ≤ 12) - /// - `data`: the actual inline data bytes - /// - /// The first 4 bytes encode length in little-endian, - /// the following 12 bytes contain the inline string data (unpadded). 
- fn make_raw_inline(length: u32, data: &[u8]) -> u128 { - assert!(length as usize <= 12, "Inline length must be ≤ 12"); - assert!( - data.len() == length as usize, - "Data length must match `length`" - ); - - let mut raw_bytes = [0u8; 16]; - raw_bytes[0..4].copy_from_slice(&length.to_le_bytes()); // length stored little-endian - raw_bytes[4..(4 + data.len())].copy_from_slice(data); // inline data - u128::from_le_bytes(raw_bytes) - } - - // Test inputs: various lengths and lexical orders, - // plus special cases for byte order and length tie-breaking - let test_inputs: Vec<&[u8]> = vec![ - b"a", - b"aa", - b"aaa", - b"aab", - b"abcd", - b"abcde", - b"abcdef", - b"abcdefg", - b"abcdefgh", - b"abcdefghi", - b"abcdefghij", - b"abcdefghijk", - b"abcdefghijkl", - // Tests for byte-order reversal bug: - // Without the fix, "backend one" would compare as "eno dnekcab", - // causing incorrect sort order relative to "backend two". - b"backend one", - b"backend two", - // Tests length-tiebreaker logic: - // "bar" (3 bytes) and "bar\0" (4 bytes) have identical inline data, - // so only the length differentiates their ordering. - b"bar", - b"bar\0", - // Additional lexical and length tie-breaking cases with same prefix, in correct lex order: - b"than12Byt", - b"than12Bytes", - b"than12Bytes\0", - b"than12Bytesx", - b"than12Bytex", - b"than12Bytez", - // Additional lexical tests - b"xyy", - b"xyz", - b"xza", - ]; - - // Create a GenericBinaryArray for cross-comparison of lex order - let array: GenericBinaryArray = GenericBinaryArray::from( - test_inputs.iter().map(|s| Some(*s)).collect::>(), - ); - - for i in 0..array.len() - 1 { - let v1 = array.value(i); - let v2 = array.value(i + 1); - - // Assert the array's natural lexical ordering is correct - assert!(v1 < v2, "Array compare failed: {v1:?} !< {v2:?}"); - - // Assert the keys produced by inline_key_fast reflect the same ordering - let key1 = inline_key_fast(make_raw_inline(v1.len() as u32, v1)); - let key2 = inline_key_fast(make_raw_inline(v2.len() as u32, v2)); - - assert!( - key1 < key2, - "Key compare failed: key({v1:?})=0x{key1:032x} !< key({v2:?})=0x{key2:032x}", - ); - } - } } diff --git a/datafusion/proto-common/proto/datafusion_common.proto b/datafusion/proto-common/proto/datafusion_common.proto index 8cb272605899..f5c79cf3d9a4 100644 --- a/datafusion/proto-common/proto/datafusion_common.proto +++ b/datafusion/proto-common/proto/datafusion_common.proto @@ -136,7 +136,19 @@ enum IntervalUnit{ MonthDayNano = 2; } -message Decimal{ +message Decimal32Type { + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal64Type { + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; +} + +message Decimal128Type { reserved 1, 2; uint32 precision = 3; int32 scale = 4; @@ -286,6 +298,8 @@ message ScalarValue{ ScalarNestedValue struct_value = 32; ScalarNestedValue map_value = 41; + Decimal32 decimal32_value = 43; + Decimal64 decimal64_value = 44; Decimal128 decimal128_value = 20; Decimal256 decimal256_value = 39; @@ -310,6 +324,18 @@ message ScalarValue{ } } +message Decimal32{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + +message Decimal64{ + bytes value = 1; + int64 p = 2; + int64 s = 3; +} + message Decimal128{ bytes value = 1; int64 p = 2; @@ -352,7 +378,9 @@ message ArrowType{ TimeUnit TIME32 = 21 ; TimeUnit TIME64 = 22 ; IntervalUnit INTERVAL = 23 ; - Decimal DECIMAL = 24 ; + Decimal32Type DECIMAL32 = 40; + Decimal64Type DECIMAL64 = 41; + Decimal128Type DECIMAL128 = 24; Decimal256Type DECIMAL256 = 36; List LIST 
= 25; List LARGE_LIST = 26; @@ -480,9 +508,7 @@ message ParquetColumnOptions { uint64 bloom_filter_ndv = 7; } - oneof max_statistics_size_opt { - uint32 max_statistics_size = 8; - } + reserved 8; // used to be uint32 max_statistics_size = 8; } message ParquetOptions { @@ -521,9 +547,7 @@ message ParquetOptions { string statistics_enabled = 13; } - oneof max_statistics_size_opt { - uint64 max_statistics_size = 14; - } + reserved 14; // used to be uint64 max_statistics_size = 14; oneof column_index_truncate_length_opt { uint64 column_index_truncate_length = 17; } diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index 0823e150268d..c5242d0176e6 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -37,6 +37,7 @@ use datafusion_common::{ TableParquetOptions, }, file_options::{csv_writer::CsvWriterOptions, json_writer::JsonWriterOptions}, + not_impl_err, parsers::CompressionTypeVariant, plan_datafusion_err, stats::Precision, @@ -257,7 +258,15 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType { arrow_type::ArrowTypeEnum::Interval(interval_unit) => { DataType::Interval(parse_i32_to_interval_unit(interval_unit)?) } - arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { + arrow_type::ArrowTypeEnum::Decimal32(protobuf::Decimal32Type { + precision, + scale, + }) => DataType::Decimal32(*precision as u8, *scale as i8), + arrow_type::ArrowTypeEnum::Decimal64(protobuf::Decimal64Type { + precision, + scale, + }) => DataType::Decimal64(*precision as u8, *scale as i8), + arrow_type::ArrowTypeEnum::Decimal128(protobuf::Decimal128Type { precision, scale, }) => DataType::Decimal128(*precision as u8, *scale as i8), @@ -469,6 +478,14 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { let null_type: DataType = v.try_into()?; null_type.try_into().map_err(Error::DataFusionError)?
} + Value::Decimal32Value(_val) => { + return not_impl_err!("Decimal32 protobuf deserialization") + .map_err(Error::DataFusionError) + } + Value::Decimal64Value(_val) => { + return not_impl_err!("Decimal64 protobuf deserialization") + .map_err(Error::DataFusionError) + } Value::Decimal128Value(val) => { let array = vec_to_array(val.value.clone()); Self::Decimal128( @@ -938,12 +955,6 @@ impl TryFrom<&protobuf::ParquetOptions> for ParquetOptions { protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt.as_ref() - .map(|opt| match opt { - protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(*v as usize), - }) - .unwrap_or(None), max_row_group_size: value.max_row_group_size as usize, created_by: value.created_by.clone(), column_index_truncate_length: value @@ -1009,12 +1020,6 @@ impl TryFrom<&protobuf::ParquetColumnOptions> for ParquetColumnOptions { protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled(v) => Some(v), }) .unwrap_or(None), - max_statistics_size: value - .max_statistics_size_opt - .map(|opt| match opt { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => Some(v as usize), - }) - .unwrap_or(None), encoding: value .encoding_opt.clone() .map(|opt| match opt { diff --git a/datafusion/proto-common/src/generated/pbjson.rs b/datafusion/proto-common/src/generated/pbjson.rs index f35fd1594695..48782ff1d93a 100644 --- a/datafusion/proto-common/src/generated/pbjson.rs +++ b/datafusion/proto-common/src/generated/pbjson.rs @@ -243,8 +243,14 @@ impl serde::Serialize for ArrowType { .map_err(|_| serde::ser::Error::custom(format!("Invalid variant {}", *v)))?; struct_ser.serialize_field("INTERVAL", &v)?; } - arrow_type::ArrowTypeEnum::Decimal(v) => { - struct_ser.serialize_field("DECIMAL", v)?; + arrow_type::ArrowTypeEnum::Decimal32(v) => { + struct_ser.serialize_field("DECIMAL32", v)?; + } + arrow_type::ArrowTypeEnum::Decimal64(v) => { + struct_ser.serialize_field("DECIMAL64", v)?; + } + arrow_type::ArrowTypeEnum::Decimal128(v) => { + struct_ser.serialize_field("DECIMAL128", v)?; } arrow_type::ArrowTypeEnum::Decimal256(v) => { struct_ser.serialize_field("DECIMAL256", v)?; @@ -314,7 +320,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "TIME32", "TIME64", "INTERVAL", - "DECIMAL", + "DECIMAL32", + "DECIMAL64", + "DECIMAL128", "DECIMAL256", "LIST", "LARGE_LIST", @@ -356,7 +364,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { Time32, Time64, Interval, - Decimal, + Decimal32, + Decimal64, + Decimal128, Decimal256, List, LargeList, @@ -413,7 +423,9 @@ impl<'de> serde::Deserialize<'de> for ArrowType { "TIME32" => Ok(GeneratedField::Time32), "TIME64" => Ok(GeneratedField::Time64), "INTERVAL" => Ok(GeneratedField::Interval), - "DECIMAL" => Ok(GeneratedField::Decimal), + "DECIMAL32" => Ok(GeneratedField::Decimal32), + "DECIMAL64" => Ok(GeneratedField::Decimal64), + "DECIMAL128" => Ok(GeneratedField::Decimal128), "DECIMAL256" => Ok(GeneratedField::Decimal256), "LIST" => Ok(GeneratedField::List), "LARGELIST" | "LARGE_LIST" => Ok(GeneratedField::LargeList), @@ -628,11 +640,25 @@ impl<'de> serde::Deserialize<'de> for ArrowType { } arrow_type_enum__ = map_.next_value::<::std::option::Option>()?.map(|x| arrow_type::ArrowTypeEnum::Interval(x as i32)); } - GeneratedField::Decimal => { + GeneratedField::Decimal32 => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("DECIMAL32")); + } + 
arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal32) +; + } + GeneratedField::Decimal64 => { if arrow_type_enum__.is_some() { - return Err(serde::de::Error::duplicate_field("DECIMAL")); + return Err(serde::de::Error::duplicate_field("DECIMAL64")); } - arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal) + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal64) +; + } + GeneratedField::Decimal128 => { + if arrow_type_enum__.is_some() { + return Err(serde::de::Error::duplicate_field("DECIMAL128")); + } + arrow_type_enum__ = map_.next_value::<::std::option::Option<_>>()?.map(arrow_type::ArrowTypeEnum::Decimal128) ; } GeneratedField::Decimal256 => { @@ -2222,45 +2248,431 @@ impl<'de> serde::Deserialize<'de> for CsvWriterOptions { } null_value__ = Some(map_.next_value()?); } - GeneratedField::Quote => { - if quote__.is_some() { - return Err(serde::de::Error::duplicate_field("quote")); + GeneratedField::Quote => { + if quote__.is_some() { + return Err(serde::de::Error::duplicate_field("quote")); + } + quote__ = Some(map_.next_value()?); + } + GeneratedField::Escape => { + if escape__.is_some() { + return Err(serde::de::Error::duplicate_field("escape")); + } + escape__ = Some(map_.next_value()?); + } + GeneratedField::DoubleQuote => { + if double_quote__.is_some() { + return Err(serde::de::Error::duplicate_field("doubleQuote")); + } + double_quote__ = Some(map_.next_value()?); + } + } + } + Ok(CsvWriterOptions { + compression: compression__.unwrap_or_default(), + delimiter: delimiter__.unwrap_or_default(), + has_header: has_header__.unwrap_or_default(), + date_format: date_format__.unwrap_or_default(), + datetime_format: datetime_format__.unwrap_or_default(), + timestamp_format: timestamp_format__.unwrap_or_default(), + time_format: time_format__.unwrap_or_default(), + null_value: null_value__.unwrap_or_default(), + quote: quote__.unwrap_or_default(), + escape: escape__.unwrap_or_default(), + double_quote: double_quote__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal128 { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.value.is_empty() { + len += 1; + } + if self.p != 0 { + len += 1; + } + if self.s != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; + if !self.value.is_empty() { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; + } + if self.p != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; + } + if self.s != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal128 { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "value", + "p", + "s", + ]; + + 
#[allow(clippy::enum_variant_names)] + enum GeneratedField { + Value, + P, + S, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "value" => Ok(GeneratedField::Value), + "p" => Ok(GeneratedField::P), + "s" => Ok(GeneratedField::S), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal128; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal128") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut value__ = None; + let mut p__ = None; + let mut s__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("value")); + } + value__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::P => { + if p__.is_some() { + return Err(serde::de::Error::duplicate_field("p")); + } + p__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::S => { + if s__.is_some() { + return Err(serde::de::Error::duplicate_field("s")); + } + s__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal128 { + value: value__.unwrap_or_default(), + p: p__.unwrap_or_default(), + s: s__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal128Type { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.precision != 0 { + len += 1; + } + if self.scale != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128Type", len)?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; + } + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal128Type { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "precision", + "scale", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Precision, + Scale, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + 
#[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal128Type; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal128Type") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut precision__ = None; + let mut scale__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); + } + precision__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); + } + scale__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal128Type { + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal128Type", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal256 { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.value.is_empty() { + len += 1; + } + if self.p != 0 { + len += 1; + } + if self.s != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; + if !self.value.is_empty() { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("value", pbjson::private::base64::encode(&self.value).as_str())?; + } + if self.p != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("p", ToString::to_string(&self.p).as_str())?; + } + if self.s != 0 { + #[allow(clippy::needless_borrow)] + #[allow(clippy::needless_borrows_for_generic_args)] + struct_ser.serialize_field("s", ToString::to_string(&self.s).as_str())?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal256 { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "value", + "p", + "s", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Value, + P, + S, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "value" => Ok(GeneratedField::Value), + "p" => Ok(GeneratedField::P), + "s" => Ok(GeneratedField::S), + _ => 
Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal256; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal256") + } + + fn visit_map<V>(self, mut map_: V) -> std::result::Result<Self::Value, V::Error> + where + V: serde::de::MapAccess<'de>, + { + let mut value__ = None; + let mut p__ = None; + let mut s__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("value")); } - quote__ = Some(map_.next_value()?); + value__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; } - GeneratedField::Escape => { - if escape__.is_some() { - return Err(serde::de::Error::duplicate_field("escape")); + GeneratedField::P => { + if p__.is_some() { + return Err(serde::de::Error::duplicate_field("p")); } - escape__ = Some(map_.next_value()?); + p__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; } - GeneratedField::DoubleQuote => { - if double_quote__.is_some() { - return Err(serde::de::Error::duplicate_field("doubleQuote")); + GeneratedField::S => { + if s__.is_some() { + return Err(serde::de::Error::duplicate_field("s")); } - double_quote__ = Some(map_.next_value()?); + s__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; } } } - Ok(CsvWriterOptions { - compression: compression__.unwrap_or_default(), - delimiter: delimiter__.unwrap_or_default(), - has_header: has_header__.unwrap_or_default(), - date_format: date_format__.unwrap_or_default(), - datetime_format: datetime_format__.unwrap_or_default(), - timestamp_format: timestamp_format__.unwrap_or_default(), - time_format: time_format__.unwrap_or_default(), - null_value: null_value__.unwrap_or_default(), - quote: quote__.unwrap_or_default(), - escape: escape__.unwrap_or_default(), - double_quote: double_quote__.unwrap_or_default(), + Ok(Decimal256 { + value: value__.unwrap_or_default(), + p: p__.unwrap_or_default(), + s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.CsvWriterOptions", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal { +impl serde::Serialize for Decimal256Type { #[allow(deprecated)] fn serialize<S>(&self, serializer: S) -> std::result::Result<S::Ok, S::Error> where @@ -2274,7 +2686,7 @@ impl serde::Serialize for Decimal { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2284,7 +2696,7 @@ impl serde::Serialize for Decimal { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal { +impl<'de> serde::Deserialize<'de> for Decimal256Type { #[allow(deprecated)] fn deserialize<D>(deserializer: D) -> std::result::Result<Self, D::Error> where @@ -2331,13 +2743,13 @@ impl<'de> serde::Deserialize<'de> for Decimal { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal; + type Value = Decimal256Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - 
formatter.write_str("struct datafusion_common.Decimal") + formatter.write_str("struct datafusion_common.Decimal256Type") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2363,16 +2775,16 @@ impl<'de> serde::Deserialize<'de> for Decimal { } } } - Ok(Decimal { + Ok(Decimal256Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal128 { +impl serde::Serialize for Decimal32 { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2389,7 +2801,7 @@ impl serde::Serialize for Decimal128 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal128", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2408,7 +2820,7 @@ impl serde::Serialize for Decimal128 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal128 { +impl<'de> serde::Deserialize<'de> for Decimal32 { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2458,13 +2870,13 @@ impl<'de> serde::Deserialize<'de> for Decimal128 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal128; + type Value = Decimal32; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal128") + formatter.write_str("struct datafusion_common.Decimal32") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2499,17 +2911,129 @@ impl<'de> serde::Deserialize<'de> for Decimal128 { } } } - Ok(Decimal128 { + Ok(Decimal32 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal128", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal32", FIELDS, GeneratedVisitor) } } -impl serde::Serialize for Decimal256 { +impl serde::Serialize for Decimal32Type { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if self.precision != 0 { + len += 1; + } + if self.scale != 0 { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal32Type", len)?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; + } + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for Decimal32Type { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "precision", + "scale", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Precision, + Scale, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: 
serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = Decimal32Type; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion_common.Decimal32Type") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut precision__ = None; + let mut scale__ = None; + while let Some(k) = map_.next_key()? { + match k { + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); + } + precision__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); + } + scale__ = + Some(map_.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) + ; + } + } + } + Ok(Decimal32Type { + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion_common.Decimal32Type", FIELDS, GeneratedVisitor) + } +} +impl serde::Serialize for Decimal64 { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2526,7 +3050,7 @@ impl serde::Serialize for Decimal256 { if self.s != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64", len)?; if !self.value.is_empty() { #[allow(clippy::needless_borrow)] #[allow(clippy::needless_borrows_for_generic_args)] @@ -2545,7 +3069,7 @@ impl serde::Serialize for Decimal256 { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256 { +impl<'de> serde::Deserialize<'de> for Decimal64 { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2595,13 +3119,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256 { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256; + type Value = Decimal64; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256") + formatter.write_str("struct datafusion_common.Decimal64") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2636,17 +3160,17 @@ impl<'de> serde::Deserialize<'de> for Decimal256 { } } } - Ok(Decimal256 { + Ok(Decimal64 { value: value__.unwrap_or_default(), p: p__.unwrap_or_default(), s: s__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal64", FIELDS, GeneratedVisitor) } } -impl 
serde::Serialize for Decimal256Type { +impl serde::Serialize for Decimal64Type { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result where @@ -2660,7 +3184,7 @@ impl serde::Serialize for Decimal256Type { if self.scale != 0 { len += 1; } - let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal256Type", len)?; + let mut struct_ser = serializer.serialize_struct("datafusion_common.Decimal64Type", len)?; if self.precision != 0 { struct_ser.serialize_field("precision", &self.precision)?; } @@ -2670,7 +3194,7 @@ impl serde::Serialize for Decimal256Type { struct_ser.end() } } -impl<'de> serde::Deserialize<'de> for Decimal256Type { +impl<'de> serde::Deserialize<'de> for Decimal64Type { #[allow(deprecated)] fn deserialize(deserializer: D) -> std::result::Result where @@ -2717,13 +3241,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } struct GeneratedVisitor; impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { - type Value = Decimal256Type; + type Value = Decimal64Type; fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - formatter.write_str("struct datafusion_common.Decimal256Type") + formatter.write_str("struct datafusion_common.Decimal64Type") } - fn visit_map(self, mut map_: V) -> std::result::Result + fn visit_map(self, mut map_: V) -> std::result::Result where V: serde::de::MapAccess<'de>, { @@ -2749,13 +3273,13 @@ impl<'de> serde::Deserialize<'de> for Decimal256Type { } } } - Ok(Decimal256Type { + Ok(Decimal64Type { precision: precision__.unwrap_or_default(), scale: scale__.unwrap_or_default(), }) } } - deserializer.deserialize_struct("datafusion_common.Decimal256Type", FIELDS, GeneratedVisitor) + deserializer.deserialize_struct("datafusion_common.Decimal64Type", FIELDS, GeneratedVisitor) } } impl serde::Serialize for DfField { @@ -4589,9 +5113,6 @@ impl serde::Serialize for ParquetColumnOptions { if self.bloom_filter_ndv_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } let mut struct_ser = serializer.serialize_struct("datafusion_common.ParquetColumnOptions", len)?; if let Some(v) = self.bloom_filter_enabled_opt.as_ref() { match v { @@ -4644,13 +5165,6 @@ impl serde::Serialize for ParquetColumnOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - struct_ser.serialize_field("maxStatisticsSize", v)?; - } - } - } struct_ser.end() } } @@ -4673,8 +5187,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "bloomFilterFpp", "bloom_filter_ndv", "bloomFilterNdv", - "max_statistics_size", - "maxStatisticsSize", ]; #[allow(clippy::enum_variant_names)] @@ -4686,7 +5198,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { StatisticsEnabled, BloomFilterFpp, BloomFilterNdv, - MaxStatisticsSize, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -4715,7 +5226,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), "bloomFilterFpp" | "bloom_filter_fpp" => Ok(GeneratedField::BloomFilterFpp), "bloomFilterNdv" | "bloom_filter_ndv" => Ok(GeneratedField::BloomFilterNdv), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -4742,7 +5252,6 @@ impl<'de> serde::Deserialize<'de> for 
ParquetColumnOptions { let mut statistics_enabled_opt__ = None; let mut bloom_filter_fpp_opt__ = None; let mut bloom_filter_ndv_opt__ = None; - let mut max_statistics_size_opt__ = None; while let Some(k) = map_.next_key()? { match k { GeneratedField::BloomFilterEnabled => { @@ -4787,12 +5296,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { } bloom_filter_ndv_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(x.0)); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } } } Ok(ParquetColumnOptions { @@ -4803,7 +5306,6 @@ impl<'de> serde::Deserialize<'de> for ParquetColumnOptions { statistics_enabled_opt: statistics_enabled_opt__, bloom_filter_fpp_opt: bloom_filter_fpp_opt__, bloom_filter_ndv_opt: bloom_filter_ndv_opt__, - max_statistics_size_opt: max_statistics_size_opt__, }) } } @@ -5090,9 +5592,6 @@ impl serde::Serialize for ParquetOptions { if self.statistics_enabled_opt.is_some() { len += 1; } - if self.max_statistics_size_opt.is_some() { - len += 1; - } if self.column_index_truncate_length_opt.is_some() { len += 1; } @@ -5216,15 +5715,6 @@ impl serde::Serialize for ParquetOptions { } } } - if let Some(v) = self.max_statistics_size_opt.as_ref() { - match v { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v) => { - #[allow(clippy::needless_borrow)] - #[allow(clippy::needless_borrows_for_generic_args)] - struct_ser.serialize_field("maxStatisticsSize", ToString::to_string(&v).as_str())?; - } - } - } if let Some(v) = self.column_index_truncate_length_opt.as_ref() { match v { parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v) => { @@ -5329,8 +5819,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "dictionaryEnabled", "statistics_enabled", "statisticsEnabled", - "max_statistics_size", - "maxStatisticsSize", "column_index_truncate_length", "columnIndexTruncateLength", "statistics_truncate_length", @@ -5370,7 +5858,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { Compression, DictionaryEnabled, StatisticsEnabled, - MaxStatisticsSize, ColumnIndexTruncateLength, StatisticsTruncateLength, Encoding, @@ -5422,7 +5909,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { "compression" => Ok(GeneratedField::Compression), "dictionaryEnabled" | "dictionary_enabled" => Ok(GeneratedField::DictionaryEnabled), "statisticsEnabled" | "statistics_enabled" => Ok(GeneratedField::StatisticsEnabled), - "maxStatisticsSize" | "max_statistics_size" => Ok(GeneratedField::MaxStatisticsSize), "columnIndexTruncateLength" | "column_index_truncate_length" => Ok(GeneratedField::ColumnIndexTruncateLength), "statisticsTruncateLength" | "statistics_truncate_length" => Ok(GeneratedField::StatisticsTruncateLength), "encoding" => Ok(GeneratedField::Encoding), @@ -5472,7 +5958,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { let mut compression_opt__ = None; let mut dictionary_enabled_opt__ = None; let mut statistics_enabled_opt__ = None; - let mut max_statistics_size_opt__ = None; let mut column_index_truncate_length_opt__ = None; let mut statistics_truncate_length_opt__ = None; let mut encoding_opt__ = None; @@ -5639,12 +6124,6 @@ 
impl<'de> serde::Deserialize<'de> for ParquetOptions { } statistics_enabled_opt__ = map_.next_value::<::std::option::Option<_>>()?.map(parquet_options::StatisticsEnabledOpt::StatisticsEnabled); } - GeneratedField::MaxStatisticsSize => { - if max_statistics_size_opt__.is_some() { - return Err(serde::de::Error::duplicate_field("maxStatisticsSize")); - } - max_statistics_size_opt__ = map_.next_value::<::std::option::Option<::pbjson::private::NumberDeserialize<_>>>()?.map(|x| parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(x.0)); - } GeneratedField::ColumnIndexTruncateLength => { if column_index_truncate_length_opt__.is_some() { return Err(serde::de::Error::duplicate_field("columnIndexTruncateLength")); @@ -5708,7 +6187,6 @@ impl<'de> serde::Deserialize<'de> for ParquetOptions { compression_opt: compression_opt__, dictionary_enabled_opt: dictionary_enabled_opt__, statistics_enabled_opt: statistics_enabled_opt__, - max_statistics_size_opt: max_statistics_size_opt__, column_index_truncate_length_opt: column_index_truncate_length_opt__, statistics_truncate_length_opt: statistics_truncate_length_opt__, encoding_opt: encoding_opt__, @@ -6959,6 +7437,12 @@ impl serde::Serialize for ScalarValue { scalar_value::Value::MapValue(v) => { struct_ser.serialize_field("mapValue", v)?; } + scalar_value::Value::Decimal32Value(v) => { + struct_ser.serialize_field("decimal32Value", v)?; + } + scalar_value::Value::Decimal64Value(v) => { + struct_ser.serialize_field("decimal64Value", v)?; + } scalar_value::Value::Decimal128Value(v) => { struct_ser.serialize_field("decimal128Value", v)?; } @@ -7085,6 +7569,10 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "structValue", "map_value", "mapValue", + "decimal32_value", + "decimal32Value", + "decimal64_value", + "decimal64Value", "decimal128_value", "decimal128Value", "decimal256_value", @@ -7147,6 +7635,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { FixedSizeListValue, StructValue, MapValue, + Decimal32Value, + Decimal64Value, Decimal128Value, Decimal256Value, Date64Value, @@ -7208,6 +7698,8 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { "fixedSizeListValue" | "fixed_size_list_value" => Ok(GeneratedField::FixedSizeListValue), "structValue" | "struct_value" => Ok(GeneratedField::StructValue), "mapValue" | "map_value" => Ok(GeneratedField::MapValue), + "decimal32Value" | "decimal32_value" => Ok(GeneratedField::Decimal32Value), + "decimal64Value" | "decimal64_value" => Ok(GeneratedField::Decimal64Value), "decimal128Value" | "decimal128_value" => Ok(GeneratedField::Decimal128Value), "decimal256Value" | "decimal256_value" => Ok(GeneratedField::Decimal256Value), "date64Value" | "date_64_value" => Ok(GeneratedField::Date64Value), @@ -7385,6 +7877,20 @@ impl<'de> serde::Deserialize<'de> for ScalarValue { return Err(serde::de::Error::duplicate_field("mapValue")); } value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::MapValue) +; + } + GeneratedField::Decimal32Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("decimal32Value")); + } + value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal32Value) +; + } + GeneratedField::Decimal64Value => { + if value__.is_some() { + return Err(serde::de::Error::duplicate_field("decimal64Value")); + } + value__ = map_.next_value::<::std::option::Option<_>>()?.map(scalar_value::Value::Decimal64Value) ; } GeneratedField::Decimal128Value => { diff --git a/datafusion/proto-common/src/generated/prost.rs 
b/datafusion/proto-common/src/generated/prost.rs index ac4a9ea4be69..aa23cea57470 100644 --- a/datafusion/proto-common/src/generated/prost.rs +++ b/datafusion/proto-common/src/generated/prost.rs @@ -117,7 +117,21 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal { +pub struct Decimal32Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal64Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -297,7 +311,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option, } @@ -352,6 +366,10 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), + #[prost(message, tag = "43")] + Decimal32Value(super::Decimal32), + #[prost(message, tag = "44")] + Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -391,6 +409,24 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal32 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal64 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -413,7 +449,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -480,8 +516,12 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), + #[prost(message, tag = "40")] + Decimal32(super::Decimal32Type), + #[prost(message, tag = "41")] + Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal(super::Decimal), + Decimal128(super::Decimal128Type), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -662,10 +702,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - #[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: 
::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -704,11 +740,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -786,10 +817,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -830,11 +857,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index b6cbe5759cfc..c06427065733 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -189,7 +189,15 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { value: Some(Box::new(value_type.as_ref().try_into()?)), })) } - DataType::Decimal128(precision, scale) => Self::Decimal(protobuf::Decimal { + DataType::Decimal32(precision, scale) => Self::Decimal32(protobuf::Decimal32Type { + precision: *precision as u32, + scale: *scale as i32, + }), + DataType::Decimal64(precision, scale) => Self::Decimal64(protobuf::Decimal64Type { + precision: *precision as u32, + scale: *scale as i32, + }), + DataType::Decimal128(precision, scale) => Self::Decimal128(protobuf::Decimal128Type { precision: *precision as u32, scale: *scale as i32, }), @@ -817,8 +825,6 @@ impl TryFrom<&ParquetOptions> for protobuf::ParquetOptions { dictionary_enabled_opt: value.dictionary_enabled.map(protobuf::parquet_options::DictionaryEnabledOpt::DictionaryEnabled), dictionary_page_size_limit: value.dictionary_page_size_limit as u64, statistics_enabled_opt: value.statistics_enabled.clone().map(protobuf::parquet_options::StatisticsEnabledOpt::StatisticsEnabled), - #[allow(deprecated)] - max_statistics_size_opt: value.max_statistics_size.map(|v| protobuf::parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v as u64)), max_row_group_size: value.max_row_group_size as u64, created_by: value.created_by.clone(), column_index_truncate_length_opt: value.column_index_truncate_length.map(|v| protobuf::parquet_options::ColumnIndexTruncateLengthOpt::ColumnIndexTruncateLength(v as u64)), @@ -858,12 +864,6 @@ impl TryFrom<&ParquetColumnOptions> for protobuf::ParquetColumnOptions { .statistics_enabled .clone() .map(protobuf::parquet_column_options::StatisticsEnabledOpt::StatisticsEnabled), - #[allow(deprecated)] - max_statistics_size_opt: value.max_statistics_size.map(|v| { - protobuf::parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize( - v as u32, - ) - }), encoding_opt: value 
.encoding .clone() diff --git a/datafusion/proto/src/generated/datafusion_proto_common.rs b/datafusion/proto/src/generated/datafusion_proto_common.rs index ac4a9ea4be69..aa23cea57470 100644 --- a/datafusion/proto/src/generated/datafusion_proto_common.rs +++ b/datafusion/proto/src/generated/datafusion_proto_common.rs @@ -117,7 +117,21 @@ pub struct Timestamp { pub timezone: ::prost::alloc::string::String, } #[derive(Clone, Copy, PartialEq, ::prost::Message)] -pub struct Decimal { +pub struct Decimal32Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal64Type { + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, +} +#[derive(Clone, Copy, PartialEq, ::prost::Message)] +pub struct Decimal128Type { #[prost(uint32, tag = "3")] pub precision: u32, #[prost(int32, tag = "4")] @@ -297,7 +311,7 @@ pub struct ScalarFixedSizeBinary { pub struct ScalarValue { #[prost( oneof = "scalar_value::Value", - tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" + tags = "33, 1, 2, 3, 23, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 32, 41, 43, 44, 20, 39, 21, 24, 35, 36, 37, 38, 26, 27, 28, 29, 22, 30, 25, 31, 34, 42" )] pub value: ::core::option::Option, } @@ -352,6 +366,10 @@ pub mod scalar_value { StructValue(super::ScalarNestedValue), #[prost(message, tag = "41")] MapValue(super::ScalarNestedValue), + #[prost(message, tag = "43")] + Decimal32Value(super::Decimal32), + #[prost(message, tag = "44")] + Decimal64Value(super::Decimal64), #[prost(message, tag = "20")] Decimal128Value(super::Decimal128), #[prost(message, tag = "39")] @@ -391,6 +409,24 @@ pub mod scalar_value { } } #[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal32 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] +pub struct Decimal64 { + #[prost(bytes = "vec", tag = "1")] + pub value: ::prost::alloc::vec::Vec, + #[prost(int64, tag = "2")] + pub p: i64, + #[prost(int64, tag = "3")] + pub s: i64, +} +#[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal128 { #[prost(bytes = "vec", tag = "1")] pub value: ::prost::alloc::vec::Vec, @@ -413,7 +449,7 @@ pub struct Decimal256 { pub struct ArrowType { #[prost( oneof = "arrow_type::ArrowTypeEnum", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 24, 36, 25, 26, 27, 28, 29, 30, 33" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 35, 32, 15, 34, 16, 31, 17, 18, 19, 20, 21, 22, 23, 40, 41, 24, 36, 25, 26, 27, 28, 29, 30, 33" )] pub arrow_type_enum: ::core::option::Option, } @@ -480,8 +516,12 @@ pub mod arrow_type { Time64(i32), #[prost(enumeration = "super::IntervalUnit", tag = "23")] Interval(i32), + #[prost(message, tag = "40")] + Decimal32(super::Decimal32Type), + #[prost(message, tag = "41")] + Decimal64(super::Decimal64Type), #[prost(message, tag = "24")] - Decimal(super::Decimal), + Decimal128(super::Decimal128Type), #[prost(message, tag = "36")] Decimal256(super::Decimal256Type), #[prost(message, tag = "25")] @@ -662,10 +702,6 @@ pub struct ParquetColumnOptions { pub bloom_filter_ndv_opt: ::core::option::Option< parquet_column_options::BloomFilterNdvOpt, >, - 
#[prost(oneof = "parquet_column_options::MaxStatisticsSizeOpt", tags = "8")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_column_options::MaxStatisticsSizeOpt, - >, } /// Nested message and enum types in `ParquetColumnOptions`. pub mod parquet_column_options { @@ -704,11 +740,6 @@ pub mod parquet_column_options { #[prost(uint64, tag = "7")] BloomFilterNdv(u64), } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint32, tag = "8")] - MaxStatisticsSize(u32), - } } #[derive(Clone, PartialEq, ::prost::Message)] pub struct ParquetOptions { @@ -786,10 +817,6 @@ pub struct ParquetOptions { pub statistics_enabled_opt: ::core::option::Option< parquet_options::StatisticsEnabledOpt, >, - #[prost(oneof = "parquet_options::MaxStatisticsSizeOpt", tags = "14")] - pub max_statistics_size_opt: ::core::option::Option< - parquet_options::MaxStatisticsSizeOpt, - >, #[prost(oneof = "parquet_options::ColumnIndexTruncateLengthOpt", tags = "17")] pub column_index_truncate_length_opt: ::core::option::Option< parquet_options::ColumnIndexTruncateLengthOpt, @@ -830,11 +857,6 @@ pub mod parquet_options { StatisticsEnabled(::prost::alloc::string::String), } #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] - pub enum MaxStatisticsSizeOpt { - #[prost(uint64, tag = "14")] - MaxStatisticsSize(u64), - } - #[derive(Clone, Copy, PartialEq, ::prost::Oneof)] pub enum ColumnIndexTruncateLengthOpt { #[prost(uint64, tag = "17")] ColumnIndexTruncateLength(u64), diff --git a/datafusion/proto/src/logical_plan/file_formats.rs b/datafusion/proto/src/logical_plan/file_formats.rs index 620442c79e72..654607bd733d 100644 --- a/datafusion/proto/src/logical_plan/file_formats.rs +++ b/datafusion/proto/src/logical_plan/file_formats.rs @@ -382,9 +382,6 @@ impl TableParquetOptionsProto { statistics_enabled_opt: global_options.global.statistics_enabled.map(|enabled| { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(enabled) }), - max_statistics_size_opt: global_options.global.max_statistics_size.map(|size| { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u64) - }), max_row_group_size: global_options.global.max_row_group_size as u64, created_by: global_options.global.created_by.clone(), column_index_truncate_length_opt: global_options.global.column_index_truncate_length.map(|length| { @@ -440,9 +437,6 @@ impl TableParquetOptionsProto { bloom_filter_ndv_opt: options.bloom_filter_ndv.map(|ndv| { parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(ndv) }), - max_statistics_size_opt: options.max_statistics_size.map(|size| { - parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size as u32) - }), }) } }).collect(), @@ -481,9 +475,6 @@ impl From<&ParquetOptionsProto> for ParquetOptions { statistics_enabled: proto.statistics_enabled_opt.as_ref().map(|opt| match opt { parquet_options::StatisticsEnabledOpt::StatisticsEnabled(statistics) => statistics.clone(), }), - max_statistics_size: proto.max_statistics_size_opt.as_ref().map(|opt| match opt { - parquet_options::MaxStatisticsSizeOpt::MaxStatisticsSize(size) => *size as usize, - }), max_row_group_size: proto.max_row_group_size as usize, created_by: proto.created_by.clone(), column_index_truncate_length: proto.column_index_truncate_length_opt.as_ref().map(|opt| match opt { @@ -542,11 +533,6 @@ impl From for ParquetColumnOptions { bloom_filter_ndv: proto .bloom_filter_ndv_opt .map(|parquet_column_options::BloomFilterNdvOpt::BloomFilterNdv(v)| v), - max_statistics_size: 
proto.max_statistics_size_opt.map( - |parquet_column_options::MaxStatisticsSizeOpt::MaxStatisticsSize(v)| { - v as usize - }, - ), } } } diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index c4baf9b438e9..d4e911a62c09 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1726,6 +1726,12 @@ impl Unparser<'_> { not_impl_err!("Unsupported DataType: conversion: {data_type:?}") } DataType::Dictionary(_, val) => self.arrow_dtype_to_ast_dtype(val), + DataType::Decimal32(_precision, _scale) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } + DataType::Decimal64(_precision, _scale) => { + not_impl_err!("Unsupported DataType: conversion: {data_type:?}") + } DataType::Decimal128(precision, scale) | DataType::Decimal256(precision, scale) => { let mut new_precision = *precision as u64; diff --git a/datafusion/sqllogictest/test_files/array.slt b/datafusion/sqllogictest/test_files/array.slt index 9ec3ff8409c2..96ab84ab9095 100644 --- a/datafusion/sqllogictest/test_files/array.slt +++ b/datafusion/sqllogictest/test_files/array.slt @@ -8062,7 +8062,19 @@ select arrow_typeof(a) from fixed_size_col_table; FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) FixedSizeList(Field { name: "item", data_type: Int32, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3) -statement error +query ? rowsort +SELECT DISTINCT a FROM fixed_size_col_table +---- +[1, 2, 3] +[4, 5, 6] + +query ?I rowsort +SELECT a, count(*) FROM fixed_size_col_table GROUP BY a +---- +[1, 2, 3] 1 +[4, 5, 6] 1 + +statement error Cast error: Cannot cast to FixedSizeList\(3\): value at index 0 has length 2 create table varying_fixed_size_col_table (a int[3]) as values ([1,2,3]), ([4,5]); # https://github.com/apache/datafusion/issues/16187 diff --git a/datafusion/sqllogictest/test_files/copy.slt b/datafusion/sqllogictest/test_files/copy.slt index 5eeb05e814ac..096cde86f26f 100644 --- a/datafusion/sqllogictest/test_files/copy.slt +++ b/datafusion/sqllogictest/test_files/copy.slt @@ -332,7 +332,6 @@ OPTIONS ( 'format.dictionary_enabled' false, 'format.statistics_enabled' page, 'format.statistics_enabled::col2' none, -'format.max_statistics_size' 123, 'format.bloom_filter_fpp' 0.001, 'format.bloom_filter_ndv' 100, 'format.metadata::key' 'value' diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 8eb1fa9a0086..fb2c89020112 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -242,7 +242,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 datafusion.execution.parquet.enable_page_index true datafusion.execution.parquet.encoding NULL datafusion.execution.parquet.max_row_group_size 1048576 -datafusion.execution.parquet.max_statistics_size 4096 datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 datafusion.execution.parquet.maximum_parallel_row_group_writers 1 datafusion.execution.parquet.metadata_size_hint NULL @@ -253,7 +252,7 @@ datafusion.execution.parquet.schema_force_view_types true datafusion.execution.parquet.skip_arrow_metadata false datafusion.execution.parquet.skip_metadata true datafusion.execution.parquet.statistics_enabled page -datafusion.execution.parquet.statistics_truncate_length NULL +datafusion.execution.parquet.statistics_truncate_length 
64 datafusion.execution.parquet.write_batch_size 1024 datafusion.execution.parquet.writer_version 1.0 datafusion.execution.planning_concurrency 13 @@ -357,7 +356,6 @@ datafusion.execution.parquet.dictionary_page_size_limit 1048576 (writing) Sets b datafusion.execution.parquet.enable_page_index true (reading) If true, reads the Parquet data page level metadata (the Page Index), if present, to reduce the I/O and number of rows decoded. datafusion.execution.parquet.encoding NULL (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting datafusion.execution.parquet.max_row_group_size 1048576 (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. -datafusion.execution.parquet.max_statistics_size 4096 (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used datafusion.execution.parquet.maximum_buffered_record_batches_per_stream 2 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.maximum_parallel_row_group_writers 1 (writing) By default parallel parquet writer is tuned for minimum memory usage in a streaming execution plan. You may see a performance benefit when writing large parquet files by increasing maximum_parallel_row_group_writers and maximum_buffered_record_batches_per_stream if your system has idle cores and can tolerate additional memory usage. Boosting these values is likely worthwhile when writing out already in-memory data, such as from a cached data frame. datafusion.execution.parquet.metadata_size_hint NULL (reading) If specified, the parquet reader will try and fetch the last `size_hint` bytes of the parquet file optimistically. If not specified, two reads are required: One read to fetch the 8-byte parquet footer and another to fetch the metadata length encoded in the footer @@ -368,7 +366,7 @@ datafusion.execution.parquet.schema_force_view_types true (reading) If true, par datafusion.execution.parquet.skip_arrow_metadata false (writing) Skip encoding the embedded arrow metadata in the KV_meta This is analogous to the `ArrowWriterOptions::with_skip_arrow_metadata`. Refer to datafusion.execution.parquet.skip_metadata true (reading) If true, the parquet reader skip the optional embedded metadata that may be in the file Schema. This setting can help avoid schema conflicts when querying multiple parquet files with schemas containing compatible types but different metadata datafusion.execution.parquet.statistics_enabled page (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. 
If NULL, uses default parquet writer setting -datafusion.execution.parquet.statistics_truncate_length NULL (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting +datafusion.execution.parquet.statistics_truncate_length 64 (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting datafusion.execution.parquet.write_batch_size 1024 (writing) Sets write_batch_size in bytes datafusion.execution.parquet.writer_version 1.0 (writing) Sets parquet writer version valid values are "1.0" and "2.0" datafusion.execution.planning_concurrency 13 Fan-out during initial physical planning. This is mostly use to plan `UNION` children in parallel. Defaults to the number of CPU cores on the system diff --git a/datafusion/sqllogictest/test_files/listing_table_statistics.slt b/datafusion/sqllogictest/test_files/listing_table_statistics.slt index a43d4be62551..37daf551c2c3 100644 --- a/datafusion/sqllogictest/test_files/listing_table_statistics.slt +++ b/datafusion/sqllogictest/test_files/listing_table_statistics.slt @@ -35,7 +35,7 @@ query TT explain format indent select * from t; ---- logical_plan TableScan: t projection=[int_col, str_col] -physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(288), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] +physical_plan DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/1.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/listing_table_statistics/2.parquet]]}, projection=[int_col, str_col], file_type=parquet, statistics=[Rows=Exact(4), Bytes=Exact(212), [(Col[0]: Min=Exact(Int64(-1)) Max=Exact(Int64(3)) Null=Exact(0)),(Col[1]: Min=Exact(Utf8View("a")) Max=Exact(Utf8View("d")) Null=Exact(0))]] statement ok drop table t; diff --git a/datafusion/sqllogictest/test_files/parquet_statistics.slt b/datafusion/sqllogictest/test_files/parquet_statistics.slt index 2b70f4f9c00a..f9e89902998c 100644 --- a/datafusion/sqllogictest/test_files/parquet_statistics.slt +++ b/datafusion/sqllogictest/test_files/parquet_statistics.slt @@ -59,10 +59,10 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, 
required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] # cleanup statement ok @@ -85,10 +85,10 @@ query TT EXPLAIN SELECT * FROM test_table WHERE column1 = 1; ---- physical_plan -01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(44), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] -03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] -04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(173), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +01)CoalesceBatchesExec: target_batch_size=8192, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +02)--FilterExec: column1@0 = 1, statistics=[Rows=Inexact(2), Bytes=Inexact(31), [(Col[0]: Min=Exact(Int64(1)) Max=Exact(Int64(1)) Null=Inexact(0))]] +03)----RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=2, statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) Null=Inexact(0))]] +04)------DataSourceExec: file_groups={2 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/0.parquet], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/parquet_statistics/test_table/1.parquet]]}, projection=[column1], file_type=parquet, predicate=column1@0 = 1, pruning_predicate=column1_null_count@2 != row_count@3 AND column1_min@0 <= 1 AND 1 <= column1_max@1, required_guarantees=[column1 in (1)], statistics=[Rows=Inexact(5), Bytes=Inexact(121), [(Col[0]: Min=Inexact(Int64(1)) Max=Inexact(Int64(4)) 
Null=Inexact(0))]] # cleanup statement ok diff --git a/datafusion/sqllogictest/test_files/repartition_scan.slt b/datafusion/sqllogictest/test_files/repartition_scan.slt index 0b851f917855..c536c8165c5a 100644 --- a/datafusion/sqllogictest/test_files/repartition_scan.slt +++ b/datafusion/sqllogictest/test_files/repartition_scan.slt @@ -61,7 +61,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # disable round robin repartitioning statement ok @@ -77,7 +77,7 @@ logical_plan physical_plan 01)CoalesceBatchesExec: target_batch_size=8192 02)--FilterExec: column1@0 != 42 -03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..141], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:141..282], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:282..423], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:423..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +03)----DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..135], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:135..270], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:270..405], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:405..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # enable round robin repartitioning again statement ok @@ -102,7 +102,7 @@ physical_plan 
02)--SortExec: expr=[column1@0 ASC NULLS LAST], preserve_partitioning=[true] 03)----CoalesceBatchesExec: target_batch_size=8192 04)------FilterExec: column1@0 != 42 -05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..280], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:280..554, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..286], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:286..563]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +05)--------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..266], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:266..526, WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..6], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:6..272], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:272..537]]}, projection=[column1], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] ## Read the files as though they are ordered @@ -138,7 +138,7 @@ physical_plan 01)SortPreservingMergeExec: [column1@0 ASC NULLS LAST] 02)--CoalesceBatchesExec: target_batch_size=8192 03)----FilterExec: column1@0 != 42 -04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..277], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..281], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:281..563], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:277..554]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] +04)------DataSourceExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:0..263], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:0..268], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/2.parquet:268..537], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/scratch/repartition_scan/parquet_table/1.parquet:263..526]]}, projection=[column1], output_ordering=[column1@0 ASC NULLS LAST], file_type=parquet, predicate=column1@0 != 42, pruning_predicate=column1_null_count@2 != row_count@3 AND (column1_min@0 != 42 OR 42 != column1_max@1), required_guarantees=[column1 not in (42)] # Cleanup statement ok diff --git 
a/datafusion/substrait/src/logical_plan/consumer/utils.rs b/datafusion/substrait/src/logical_plan/consumer/utils.rs index 67215e8e343e..f7eedcb7a2b2 100644 --- a/datafusion/substrait/src/logical_plan/consumer/utils.rs +++ b/datafusion/substrait/src/logical_plan/consumer/utils.rs @@ -213,6 +213,8 @@ pub fn rename_data_type( | DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View + | DataType::Decimal32(_, _) + | DataType::Decimal64(_, _) | DataType::Decimal128(_, _) | DataType::Decimal256(_, _) => Ok(data_type.clone()), } diff --git a/docs/source/library-user-guide/upgrading.md b/docs/source/library-user-guide/upgrading.md index a9c39b086118..828beec8545d 100644 --- a/docs/source/library-user-guide/upgrading.md +++ b/docs/source/library-user-guide/upgrading.md @@ -212,6 +212,12 @@ use datafusion_physical_expr_adapter::{ }; ``` +### Upgrade to arrow `56.0.0` and parquet `56.0.0` + +This version of DataFusion upgrades the underlying Apache Arrow implementation +to version `56.0.0`. See the [release notes](https://github.com/apache/arrow-rs/releases/tag/56.0.0) +for more details. + ### Added `ExecutionPlan::reset_state` In order to fix a bug in DataFusion `49.0.0` where dynamic filters (currently only generated in the presence of a query such as `ORDER BY ... LIMIT ...`) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index cdc6d615e37f..3d4730958fb3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -96,11 +96,10 @@ The following configuration settings are available: | datafusion.execution.parquet.dictionary_enabled | true | (writing) Sets if dictionary encoding is enabled. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.dictionary_page_size_limit | 1048576 | (writing) Sets best effort maximum dictionary page size, in bytes | | datafusion.execution.parquet.statistics_enabled | page | (writing) Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting | -| datafusion.execution.parquet.max_statistics_size | 4096 | (writing) Sets max statistics size for any column. If NULL, uses default parquet writer setting max_statistics_size is deprecated, currently it is not being used | | datafusion.execution.parquet.max_row_group_size | 1048576 | (writing) Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. | | datafusion.execution.parquet.created_by | datafusion version 50.0.0 | (writing) Sets "created by" property | | datafusion.execution.parquet.column_index_truncate_length | 64 | (writing) Sets column index truncate length | -| datafusion.execution.parquet.statistics_truncate_length | NULL | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | +| datafusion.execution.parquet.statistics_truncate_length | 64 | (writing) Sets statistics truncate length. If NULL, uses default parquet writer setting | | datafusion.execution.parquet.data_page_row_count_limit | 20000 | (writing) Sets best effort maximum number of rows in data page | | datafusion.execution.parquet.encoding | NULL | (writing) Sets default encoding for any column. Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. 
If NULL, uses default parquet writer setting | | datafusion.execution.parquet.bloom_filter_on_write | false | (writing) Write bloom filters for all columns when creating parquet files | diff --git a/parquet-testing b/parquet-testing index a3d96a65e11e..107b36603e05 160000 --- a/parquet-testing +++ b/parquet-testing @@ -1 +1 @@ -Subproject commit a3d96a65e11e2bbca7d22a894e8313ede90a33a3 +Subproject commit 107b36603e051aee26bd93e04b871034f6c756c0 diff --git a/testing b/testing index 0d60ccae40d0..d2a137123034 160000 --- a/testing +++ b/testing @@ -1 +1 @@ -Subproject commit 0d60ccae40d0e8f2d22c15fafb01c5d4be8c63a6 +Subproject commit d2a13712303498963395318a4eb42872e66aead7 diff --git a/typos.toml b/typos.toml index 5741ab8e9cc9..46f21febcf86 100644 --- a/typos.toml +++ b/typos.toml @@ -15,7 +15,6 @@ aroun = "aroun" abov = "abov" Ois = "Ois" alo = "alo" -Byt = "Byt" # abbreviations, common words, etc. typ = "typ" From 5271ce8b75f4013a486096fe8e0e2e12853b44be Mon Sep 17 00:00:00 2001 From: LiaCastaneda Date: Wed, 15 Oct 2025 11:40:45 +0200 Subject: [PATCH 2/2] Allow typo --- typos.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/typos.toml b/typos.toml index 46f21febcf86..1b1af4be339e 100644 --- a/typos.toml +++ b/typos.toml @@ -21,6 +21,7 @@ typ = "typ" datas = "datas" YOUY = "YOUY" lits = "lits" +nteger = "nteger" # exposed to public API Serializeable = "Serializeable"
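Reviewer note: below is a minimal round-trip sketch for the new `Decimal32` message added in this series, exercising its generated pbjson `Serialize`/`Deserialize` impls. The `use` path and the little-endian byte convention (mirroring the existing `Decimal128` handling) are assumptions, not part of the patch; adjust the path to wherever the generated `protobuf` module is exposed in your build, and add `serde_json` as a dev-dependency to run it.

    // Hypothetical usage sketch; assumes the generated module is reachable
    // as `datafusion_proto_common::protobuf`.
    use datafusion_proto_common::protobuf::Decimal32;

    fn main() -> Result<(), Box<dyn std::error::Error>> {
        // 123.45 as a Decimal32 with precision 5 and scale 2: the unscaled
        // i32 value 12345, stored as raw little-endian bytes (assumed to
        // follow the existing Decimal128 convention).
        let original = Decimal32 {
            value: 12345i32.to_le_bytes().to_vec(),
            p: 5,
            s: 2,
        };

        // Per the generated impls in this patch, `value` serializes as
        // base64 and the 64-bit `p`/`s` fields serialize as strings.
        let json = serde_json::to_string(&original)?;

        let decoded: Decimal32 = serde_json::from_str(&json)?;
        assert_eq!(original, decoded);
        Ok(())
    }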