diff --git a/arrow-array/src/lib.rs b/arrow-array/src/lib.rs
index 90bc5e31205a..0fc9d30ab6e3 100644
--- a/arrow-array/src/lib.rs
+++ b/arrow-array/src/lib.rs
@@ -161,7 +161,52 @@
 //! array.as_primitive::<Float32Type>().values()
 //! }
 //! ```
+//! # Alternatives to ChunkedArray Support
 //!
+//! The Rust implementation does not provide the ChunkedArray abstraction implemented by the Python
+//! and C++ Arrow implementations. The recommended alternative is to use one of the following:
+//! - `Vec<ArrayRef>` a simple, eager version of a `ChunkedArray`
+//! - `impl Iterator<Item = ArrayRef>` a lazy version of a `ChunkedArray`
+//! - `impl Stream<Item = ArrayRef>` a lazy async version of a `ChunkedArray`
+//!
+//! Similar patterns can be applied at the `RecordBatch` level. For example, [DataFusion] makes
+//! extensive use of [RecordBatchStream].
+//!
+//! This approach integrates well into the Rust ecosystem, simplifies the implementation and
+//! encourages the use of performant lazy and async patterns.
+//! ```rust
+//! use std::sync::Arc;
+//! use arrow_array::{ArrayRef, Float32Array, RecordBatch, StringArray};
+//! use arrow_array::cast::AsArray;
+//! use arrow_array::types::Float32Type;
+//! use arrow_schema::DataType;
+//!
+//! let batches = [
+//!     RecordBatch::try_from_iter(vec![
+//!         ("label", Arc::new(StringArray::from(vec!["A", "B", "C"])) as ArrayRef),
+//!         ("value", Arc::new(Float32Array::from(vec![0.1, 0.2, 0.3])) as ArrayRef),
+//!     ]).unwrap(),
+//!     RecordBatch::try_from_iter(vec![
+//!         ("label", Arc::new(StringArray::from(vec!["D", "E"])) as ArrayRef),
+//!         ("value", Arc::new(Float32Array::from(vec![0.4, 0.5])) as ArrayRef),
+//!     ]).unwrap(),
+//! ];
+//!
+//! let labels: Vec<&str> = batches
+//!     .iter()
+//!     .flat_map(|batch| batch.column(0).as_string::<i32>())
+//!     .map(Option::unwrap)
+//!     .collect();
+//!
+//! let values: Vec<f32> = batches
+//!     .iter()
+//!     .flat_map(|batch| batch.column(1).as_primitive::<Float32Type>().values())
+//!     .copied()
+//!     .collect();
+//!
+//! assert_eq!(labels, ["A", "B", "C", "D", "E"]);
+//! assert_eq!(values, [0.1, 0.2, 0.3, 0.4, 0.5]);
+//! ```
 //! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer
 //! [`ScalarBuffer`]: arrow_buffer::ScalarBuffer
 //! [`OffsetBuffer`]: arrow_buffer::OffsetBuffer
@@ -173,6 +218,8 @@
 //! [`compute`]: https://docs.rs/arrow/latest/arrow/compute/index.html
 //! [`json`]: https://docs.rs/arrow/latest/arrow/json/index.html
 //! [`csv`]: https://docs.rs/arrow/latest/arrow/csv/index.html
+//! [DataFusion]: https://github.com/apache/arrow-datafusion
+//! [RecordBatchStream]: https://docs.rs/datafusion/latest/datafusion/execution/trait.RecordBatchStream.html
 
 #![deny(rustdoc::broken_intra_doc_links)]
 #![warn(missing_docs)]
diff --git a/arrow/examples/README.md b/arrow/examples/README.md
index 5c57ec00cd76..87aa6ee0475b 100644
--- a/arrow/examples/README.md
+++ b/arrow/examples/README.md
@@ -21,7 +21,7 @@
 
 - [`builders.rs`](builders.rs): Using the Builder API
 - [`collect.rs`](collect.rs): Using the `FromIter` API
-- [`dynamic_types.rs`](dynamic_types.rs):
+- [`dynamic_types.rs`](dynamic_types.rs): Dealing with mixed types dynamically at runtime
 - [`read_csv.rs`](read_csv.rs): Reading CSV files with explicit schema, pretty printing Arrays
 - [`read_csv_infer_schema.rs`](read_csv_infer_schema.rs): Reading CSV files, pretty printing Arrays
 - [`tensor_builder.rs`](tensor_builder.rs): Using tensor builder