From 5bc482e707cf5d2a2eee7246e466519ba3e5fbeb Mon Sep 17 00:00:00 2001 From: Eric Fredine Date: Tue, 8 Oct 2024 09:42:33 -0700 Subject: [PATCH 1/5] Adds documentation and example recommending Vec as an alternative to a ChunkedArray abstraction." --- arrow/examples/README.md | 3 +- arrow/examples/chunked_arrays.rs | 106 +++++++++++++++++++++++++++++++ arrow/src/lib.rs | 63 ++++++++++++++++++ 3 files changed, 171 insertions(+), 1 deletion(-) create mode 100644 arrow/examples/chunked_arrays.rs diff --git a/arrow/examples/README.md b/arrow/examples/README.md index 5c57ec00cd76..f33252355c0a 100644 --- a/arrow/examples/README.md +++ b/arrow/examples/README.md @@ -20,8 +20,9 @@ # Examples - [`builders.rs`](builders.rs): Using the Builder API +- [`chunked_arrays.rs`](chunked_arrays.rs): Using Vec to represent chunked arrays - [`collect.rs`](collect.rs): Using the `FromIter` API -- [`dynamic_types.rs`](dynamic_types.rs): +- [`dynamic_types.rs`](dynamic_types.rs): Dealing with mixed types dynamically at runtime - [`read_csv.rs`](read_csv.rs): Reading CSV files with explicit schema, pretty printing Arrays - [`read_csv_infer_schema.rs`](read_csv_infer_schema.rs): Reading CSV files, pretty printing Arrays - [`tensor_builder.rs`](tensor_builder.rs): Using tensor builder diff --git a/arrow/examples/chunked_arrays.rs b/arrow/examples/chunked_arrays.rs new file mode 100644 index 000000000000..59941466a55c --- /dev/null +++ b/arrow/examples/chunked_arrays.rs @@ -0,0 +1,106 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! This example demonstrates using Vec as an alternative to ChunkedArray. +use arrow::array::{ArrayRef, AsArray, Float32Array, StringArray}; +use arrow::record_batch::RecordBatch; +use arrow_array::cast::as_string_array; +use arrow_array::types::Float32Type; +use std::sync::Arc; + +fn main() { + let batches = [ + RecordBatch::try_from_iter(vec![ + ( + "label", + Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef, + ), + ( + "value", + Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef, + ), + ]) + .unwrap(), + RecordBatch::try_from_iter(vec![ + ( + "label", + Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef, + ), + ( + "value", + Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef, + ), + ]) + .unwrap(), + ]; + + // chunked_array_by_index is an array of two Vec where each Vec is a column + let mut chunked_array_by_index = [Vec::new(), Vec::new()]; + for batch in &batches { + for (i, array) in batch.columns().iter().enumerate() { + chunked_array_by_index[i].push(array.clone()); + } + } + + // downcast and iterate over the values - column 0 is the labels and column 1 is the values + let labels: Vec<&str> = chunked_array_by_index[0] + .iter() + .flat_map(|x| as_string_array(x).iter()) + .flatten() // flatten the Option to String + .collect(); + + let values: Vec = chunked_array_by_index[1] + .iter() + .flat_map(|x| x.as_primitive::().iter()) + .flatten() // flatten the Option to f32 + .collect(); + + assert_eq!(labels, ["A", "B", "C", "D", "E"]); + assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); + + // Or you could use a struct with typed chunks and downcast as you gather them + type ChunkedStringArray = Vec; + type ChunkedFloat32Array = Vec; + + #[derive(Default)] + struct MyTable { + labels: ChunkedStringArray, + values: ChunkedFloat32Array, + } + + let table = batches + .iter() + .fold(MyTable::default(), |mut table_acc, batch| { + batch.columns().iter().enumerate().for_each(|(i, array)| { + match batch.schema().field(i).name().as_str() { + "label" => table_acc.labels.push(array.as_string().clone()), + "value" => { + table_acc.values.push(array.as_primitive().clone()); + } + _ => unreachable!(), + } + }); + table_acc + }); + + // first flatten is from Vec> to Vec, second is from Vec> to Vec + let labels: Vec<&str> = table.labels.iter().flatten().flatten().collect(); + let values: Vec = table.values.iter().flatten().flatten().collect(); + + assert_eq!(labels, ["A", "B", "C", "D", "E"]); + assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); +} diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 5002e5bf181a..8a9e4f5ac33e 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -345,6 +345,67 @@ //! orchestrates the primitives exported by this crate into an embeddable query engine, with //! SQL and DataFrame frontends, and heavily influences this crate's roadmap. //! +//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python +//! and C++ Arrow implementations. The recommended alternative is to use one of the following: +//! - `Vec` a simple, eager version of a `ChunkedArray` +//! - `impl Iterator` a lazy version of a `ChunkedArray` +//! - `impl Stream` a lazy async version of a `ChunkedArray` +//! +//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes +//! extensive use of [RecordBatchStream]. +//! +//! This approach integrates well into the Rust ecosystem, simplifies the implementation and +//! encourages the use of performant lazy and async patterns. +//! +//! Aside from providing a slightly less convenient API, one other downside is the lack of support +//! for processing compute kernels across chunked arrays. But this use case is well-supported by +//! [DataFusion]. +//! +//! The iterator API makes it ergonomic to work with these patterns: +//! ```rust +//! use arrow::array::{as_string_array, ArrayRef, AsArray, Float32Array, StringArray}; +//! use arrow::record_batch::RecordBatch; +//! use std::sync::Arc; +//! use arrow::datatypes::Float32Type; +//! +//! let batches = [ +//! RecordBatch::try_from_iter(vec![ +//! ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef), +//! ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef), +//! ]).unwrap(), +//! RecordBatch::try_from_iter(vec![ +//! ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef), +//! ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef), +//! ]).unwrap(), +//! ]; +//! +//! // chunked_array_by_index is an array of two Vec where each Vec is a column +//! let mut chunked_array_by_index = [Vec::new(), Vec::new()]; +//! for batch in &batches { +//! for (i, array) in batch.columns().iter().enumerate() { +//! chunked_array_by_index[i].push(array.clone()); +//! } +//! } +//! +//! // downcast and iterate over the values - column 0 is the labels and column 1 is the values +//! let labels: Vec<&str> = chunked_array_by_index[0] +//! .iter() +//! .flat_map(|x|as_string_array(x).iter()) // flatten and downcast to StringArray +//! .flatten() // flatten the Option to String +//! .collect(); +//! +//! let values: Vec = chunked_array_by_index[1] +//! .iter() +//! .flat_map(|x|x.as_primitive::().iter()) +//! .flatten() +//! .collect(); +//! +//! assert_eq!(labels, ["A", "B", "C", "D", "E"]); +//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); +//! +//!``` +//! See the [chunked_arrays example] for an example using a struct of typed chunks. +//! //! [`arrow`]: https://github.com/apache/arrow-rs //! [`array`]: mod@array //! [`Array`]: array::Array @@ -361,6 +422,8 @@ //! [Apache Parquet]: https://parquet.apache.org/ //! [DataFusion]: https://github.com/apache/arrow-datafusion //! [issue tracker]: https://github.com/apache/arrow-rs/issues +//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html +//! [chunked_arrays example]: ../examples/chunked_arrays.rs #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] From f53cb75f274ae599ff2ba7eff84fcad54d001b50 Mon Sep 17 00:00:00 2001 From: Eric Fredine Date: Tue, 8 Oct 2024 11:21:13 -0700 Subject: [PATCH 2/5] Remove link to example. --- arrow/src/lib.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 8a9e4f5ac33e..837bff81c951 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -404,7 +404,7 @@ //! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); //! //!``` -//! See the [chunked_arrays example] for an example using a struct of typed chunks. +//! See `examples/chunked_arrays.rs` for an example with a struct of typed arrays. //! //! [`arrow`]: https://github.com/apache/arrow-rs //! [`array`]: mod@array @@ -423,7 +423,6 @@ //! [DataFusion]: https://github.com/apache/arrow-datafusion //! [issue tracker]: https://github.com/apache/arrow-rs/issues //! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html -//! [chunked_arrays example]: ../examples/chunked_arrays.rs #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] From c1e8ed55ef40e3df3013aa8056537d748ef7c315 Mon Sep 17 00:00:00 2001 From: Eric Fredine Date: Tue, 8 Oct 2024 11:23:43 -0700 Subject: [PATCH 3/5] Reduce width of doc example --- arrow/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 837bff81c951..9134eec76e38 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -379,7 +379,7 @@ //! ]).unwrap(), //! ]; //! -//! // chunked_array_by_index is an array of two Vec where each Vec is a column +//! // chunked_array_by_index is an array of two Vec //! let mut chunked_array_by_index = [Vec::new(), Vec::new()]; //! for batch in &batches { //! for (i, array) in batch.columns().iter().enumerate() { From 96f14fa6f29929a5f939869a567681103a5b8d47 Mon Sep 17 00:00:00 2001 From: Eric Fredine Date: Tue, 8 Oct 2024 14:07:13 -0700 Subject: [PATCH 4/5] Move documentation to arrow-array. Simplify doc example. Remove top-level example. --- arrow-array/src/lib.rs | 50 +++++++++++++++ arrow/examples/README.md | 1 - arrow/examples/chunked_arrays.rs | 106 ------------------------------- arrow/src/lib.rs | 62 ------------------ 4 files changed, 50 insertions(+), 169 deletions(-) delete mode 100644 arrow/examples/chunked_arrays.rs diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 90bc5e31205a..88ace550f284 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -161,7 +161,55 @@ //! array.as_primitive::().values() //! } //! ``` +//! # Alternatives to ChunkedArray Support //! +//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python +//! and C++ Arrow implementations. The recommended alternative is to use one of the following: +//! - `Vec` a simple, eager version of a `ChunkedArray` +//! - `impl Iterator` a lazy version of a `ChunkedArray` +//! - `impl Stream` a lazy async version of a `ChunkedArray` +//! +//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes +//! extensive use of [RecordBatchStream]. +//! +//! This approach integrates well into the Rust ecosystem, simplifies the implementation and +//! encourages the use of performant lazy and async patterns. +//! +//! The iterator API makes it ergonomic to work with these patterns and since most kernels accept +//! `&dyn Array` many use-cases do not even require downcasting. +//! ```rust +//! use std::sync::Arc; +//! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray}; +//! use arrow_array::cast::AsArray; +//! use arrow_array::types::Float32Type; +//! use arrow_schema::DataType; +//! +//! let batches = [ +//! RecordBatch::try_from_iter(vec![ +//! ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef), +//! ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef), +//! ]).unwrap(), +//! RecordBatch::try_from_iter(vec![ +//! ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef), +//! ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef), +//! ]).unwrap(), +//! ]; +//! +//! let labels: Vec<&str> = batches +//! .iter() +//! .flat_map(|batch| batch.column(0).as_string::()) +//! .map(Option::unwrap) +//! .collect(); +//! +//! let values: Vec = batches +//! .iter() +//! .flat_map(|batch| batch.column(1).as_primitive::().values()) +//! .copied() +//! .collect(); +//! +//! assert_eq!(labels, ["A", "B", "C", "D", "E"]); +//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); +//!``` //! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer //! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer //! [`OffsetBuffer`]: arrow_buffer::OffsetBuffer @@ -173,6 +221,8 @@ //! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html //! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html //! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html +//! [DataFusion]: https://github.com/apache/arrow-datafusion +//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html #![deny(rustdoc::broken_intra_doc_links)] #![warn(missing_docs)] diff --git a/arrow/examples/README.md b/arrow/examples/README.md index f33252355c0a..87aa6ee0475b 100644 --- a/arrow/examples/README.md +++ b/arrow/examples/README.md @@ -20,7 +20,6 @@ # Examples - [`builders.rs`](builders.rs): Using the Builder API -- [`chunked_arrays.rs`](chunked_arrays.rs): Using Vec to represent chunked arrays - [`collect.rs`](collect.rs): Using the `FromIter` API - [`dynamic_types.rs`](dynamic_types.rs): Dealing with mixed types dynamically at runtime - [`read_csv.rs`](read_csv.rs): Reading CSV files with explicit schema, pretty printing Arrays diff --git a/arrow/examples/chunked_arrays.rs b/arrow/examples/chunked_arrays.rs deleted file mode 100644 index 59941466a55c..000000000000 --- a/arrow/examples/chunked_arrays.rs +++ /dev/null @@ -1,106 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -//! This example demonstrates using Vec as an alternative to ChunkedArray. -use arrow::array::{ArrayRef, AsArray, Float32Array, StringArray}; -use arrow::record_batch::RecordBatch; -use arrow_array::cast::as_string_array; -use arrow_array::types::Float32Type; -use std::sync::Arc; - -fn main() { - let batches = [ - RecordBatch::try_from_iter(vec![ - ( - "label", - Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef, - ), - ( - "value", - Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef, - ), - ]) - .unwrap(), - RecordBatch::try_from_iter(vec![ - ( - "label", - Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef, - ), - ( - "value", - Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef, - ), - ]) - .unwrap(), - ]; - - // chunked_array_by_index is an array of two Vec where each Vec is a column - let mut chunked_array_by_index = [Vec::new(), Vec::new()]; - for batch in &batches { - for (i, array) in batch.columns().iter().enumerate() { - chunked_array_by_index[i].push(array.clone()); - } - } - - // downcast and iterate over the values - column 0 is the labels and column 1 is the values - let labels: Vec<&str> = chunked_array_by_index[0] - .iter() - .flat_map(|x| as_string_array(x).iter()) - .flatten() // flatten the Option to String - .collect(); - - let values: Vec = chunked_array_by_index[1] - .iter() - .flat_map(|x| x.as_primitive::().iter()) - .flatten() // flatten the Option to f32 - .collect(); - - assert_eq!(labels, ["A", "B", "C", "D", "E"]); - assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); - - // Or you could use a struct with typed chunks and downcast as you gather them - type ChunkedStringArray = Vec; - type ChunkedFloat32Array = Vec; - - #[derive(Default)] - struct MyTable { - labels: ChunkedStringArray, - values: ChunkedFloat32Array, - } - - let table = batches - .iter() - .fold(MyTable::default(), |mut table_acc, batch| { - batch.columns().iter().enumerate().for_each(|(i, array)| { - match batch.schema().field(i).name().as_str() { - "label" => table_acc.labels.push(array.as_string().clone()), - "value" => { - table_acc.values.push(array.as_primitive().clone()); - } - _ => unreachable!(), - } - }); - table_acc - }); - - // first flatten is from Vec> to Vec, second is from Vec> to Vec - let labels: Vec<&str> = table.labels.iter().flatten().flatten().collect(); - let values: Vec = table.values.iter().flatten().flatten().collect(); - - assert_eq!(labels, ["A", "B", "C", "D", "E"]); - assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); -} diff --git a/arrow/src/lib.rs b/arrow/src/lib.rs index 9134eec76e38..5002e5bf181a 100644 --- a/arrow/src/lib.rs +++ b/arrow/src/lib.rs @@ -345,67 +345,6 @@ //! orchestrates the primitives exported by this crate into an embeddable query engine, with //! SQL and DataFrame frontends, and heavily influences this crate's roadmap. //! -//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python -//! and C++ Arrow implementations. The recommended alternative is to use one of the following: -//! - `Vec` a simple, eager version of a `ChunkedArray` -//! - `impl Iterator` a lazy version of a `ChunkedArray` -//! - `impl Stream` a lazy async version of a `ChunkedArray` -//! -//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes -//! extensive use of [RecordBatchStream]. -//! -//! This approach integrates well into the Rust ecosystem, simplifies the implementation and -//! encourages the use of performant lazy and async patterns. -//! -//! Aside from providing a slightly less convenient API, one other downside is the lack of support -//! for processing compute kernels across chunked arrays. But this use case is well-supported by -//! [DataFusion]. -//! -//! The iterator API makes it ergonomic to work with these patterns: -//! ```rust -//! use arrow::array::{as_string_array, ArrayRef, AsArray, Float32Array, StringArray}; -//! use arrow::record_batch::RecordBatch; -//! use std::sync::Arc; -//! use arrow::datatypes::Float32Type; -//! -//! let batches = [ -//! RecordBatch::try_from_iter(vec![ -//! ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef), -//! ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef), -//! ]).unwrap(), -//! RecordBatch::try_from_iter(vec![ -//! ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef), -//! ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef), -//! ]).unwrap(), -//! ]; -//! -//! // chunked_array_by_index is an array of two Vec -//! let mut chunked_array_by_index = [Vec::new(), Vec::new()]; -//! for batch in &batches { -//! for (i, array) in batch.columns().iter().enumerate() { -//! chunked_array_by_index[i].push(array.clone()); -//! } -//! } -//! -//! // downcast and iterate over the values - column 0 is the labels and column 1 is the values -//! let labels: Vec<&str> = chunked_array_by_index[0] -//! .iter() -//! .flat_map(|x|as_string_array(x).iter()) // flatten and downcast to StringArray -//! .flatten() // flatten the Option to String -//! .collect(); -//! -//! let values: Vec = chunked_array_by_index[1] -//! .iter() -//! .flat_map(|x|x.as_primitive::().iter()) -//! .flatten() -//! .collect(); -//! -//! assert_eq!(labels, ["A", "B", "C", "D", "E"]); -//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]); -//! -//!``` -//! See `examples/chunked_arrays.rs` for an example with a struct of typed arrays. -//! //! [`arrow`]: https://github.com/apache/arrow-rs //! [`array`]: mod@array //! [`Array`]: array::Array @@ -422,7 +361,6 @@ //! [Apache Parquet]: https://parquet.apache.org/ //! [DataFusion]: https://github.com/apache/arrow-datafusion //! [issue tracker]: https://github.com/apache/arrow-rs/issues -//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html #![deny(clippy::redundant_clone)] #![warn(missing_debug_implementations)] From 5ef40abe03f958753955d68876850686804c07ff Mon Sep 17 00:00:00 2001 From: Raphael Taylor-Davies <1781103+tustvold@users.noreply.github.com> Date: Thu, 10 Oct 2024 21:44:20 +0100 Subject: [PATCH 5/5] Update arrow-array/src/lib.rs --- arrow-array/src/lib.rs | 3 --- 1 file changed, 3 deletions(-) diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs index 88ace550f284..0fc9d30ab6e3 100644 --- a/arrow-array/src/lib.rs +++ b/arrow-array/src/lib.rs @@ -174,9 +174,6 @@ //! //! This approach integrates well into the Rust ecosystem, simplifies the implementation and //! encourages the use of performant lazy and async patterns. -//! -//! The iterator API makes it ergonomic to work with these patterns and since most kernels accept -//! `&dyn Array` many use-cases do not even require downcasting. //! ```rust //! use std::sync::Arc; //! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray};