From 366628230249f73f9a9de5fac728c292ec83da8c Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Feb 2024 05:27:59 -0500 Subject: [PATCH 1/4] Update readme with arrow examples --- Readme.md | 74 +++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 3 deletions(-) diff --git a/Readme.md b/Readme.md index 48169099..ed94fdf7 100644 --- a/Readme.md +++ b/Readme.md @@ -32,7 +32,39 @@ arrays, and deserialization from arrays to Rust structs. [datafusion]: https://github.com/apache/arrow-datafusion/ ## Example +Given this Rust structure +```rust +#[derive(Serialize, Deserialize)] +struct Record { + a: f32, + b: i32, +} + +let records = vec![ + Record { a: 1.0, b: 1 }, + Record { a: 2.0, b: 2 }, + Record { a: 3.0, b: 3 }, +]; +``` +### Serialize to `arrow` `RecordBatch` +```rust +use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; + +// Determine Arrow schema +let fields = + SerdeArrowSchema::from_type::<Record>(TracingOptions::default())? + .to_arrow_fields()?; + +// Convert to Arrow arrays +let arrays = serde_arrow::to_arrow(&fields, &records)?; + +// Form a RecordBatch +let schema = Schema::new(&fields); +let batch = RecordBatch::try_new(schema.into(), arrays)?; +``` + +### Serialize to `arrow2` arrays ```rust use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; @@ -55,7 +87,27 @@ let fields = let arrays = serde_arrow::to_arrow2(&fields, &records)?; ``` -These arrays can now be written to disk using the helper method defined in the +These arrays can now be written to disk in formats such as Parquet using the +appropriate Arrow or Arrow2 APIs. 
+ +### Write `arrow` `RecordBatch` to Parquet + +You can write the `RecordBatch` to a Parquet file using [ArrowWriter] from the +[parquet] crate: + +[ArrowWriter]: https://docs.rs/parquet/latest/parquet/arrow/arrow_writer/struct.ArrowWriter.html +[parquet]: https://docs.rs/parquet/latest/parquet/ + + +```rust +let file = File::create("example.pq"); +let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; +writer.write(&batch)?; +writer.close()?; +``` + +### Write `arrow2` arrays to Parquet +using the helper method defined in the [arrow2 guide][arrow2-guide]. For parquet: ```rust,ignore @@ -71,14 +123,30 @@ write_chunk( The written file can now be read in Python via +### Polars ```python -# using polars import polars as pl pl.read_parquet("example.pq") +shape: (3, 2) +┌─────┬─────┐ +│ a ┆ b │ +│ --- ┆ --- │ +│ f32 ┆ i32 │ +╞═════╪═════╡ +│ 1.0 ┆ 1 │ +│ 2.0 ┆ 2 │ +│ 3.0 ┆ 3 │ +└─────┴─────┘ +``` -# using pandas +### Pandas +```python import pandas as pd pd.read_parquet("example.pq") + a b +0 1.0 1 +1 2.0 2 +2 3.0 3 ``` [arrow2-guide]: https://jorgecarleitao.github.io/arrow2 From b966b4c14e6a3682d07d0b9df873cd16344bcc18 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Feb 2024 05:37:54 -0500 Subject: [PATCH 2/4] Add doc example --- serde_arrow/src/lib.rs | 44 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 2c1c0e01..75b336c8 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -45,7 +45,49 @@ //! - the [status summary][_impl::docs::status] for an overview over the //! supported Arrow and Rust constructs //! -//! ## Example +//! ## `arrow` Example +//! ```rust +//! # use serde::{Deserialize, Serialize}; +//! # #[cfg(feature = "has_arrow")] +//! # fn main() -> serde_arrow::Result<()> { +//! use arrow::datatypes::Schema; +//! use arrow::record_batch::RecordBatch; +//! 
use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; +//! +//! ##[derive(Serialize, Deserialize)] +//! struct Record { +//! a: f32, +//! b: i32, +//! } +//! +//! let records = vec![ +//! Record { a: 1.0, b: 1 }, +//! Record { a: 2.0, b: 2 }, +//! Record { a: 3.0, b: 3 }, +//! ]; +//! +//! // Determine Arrow schema +//! let fields = Vec::<Field>::from_type::<Record>(TracingOptions::default())?; +//! +//! // Convert Rust records to Arrow arrays +//! let arrays = serde_arrow::to_arrow(&fields, &records)?; +//! +//! // Create RecordBatch +//! let schema = Schema::new(fields); +//! let batch = RecordBatch::try_new(schema, arrays)?; +//! # Ok(()) +//! # } +//! # #[cfg(not(feature = "has_arrow"))] +//! # fn main() { } +//! ``` +//! +//! The `RecordBatch` can then be written to disk, e.g., as parquet using +//! the [`ArrowWriter`] from the [`parquet`] crate: +//! +//! [`ArrowWriter`]: https://docs.rs/parquet/latest/parquet/arrow/arrow_writer/struct.ArrowWriter.html +//! [`parquet`]: https://docs.rs/parquet/latest/parquet/ +//! +//! ## `arrow2` Example //! //! Requires one of `arrow2` feature (see below). //! From 0833eb072d20089275ca52b9216d99f4fa582531 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Feb 2024 19:55:38 -0500 Subject: [PATCH 3/4] Improve heading to content ratio, reorganize content --- Readme.md | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) diff --git a/Readme.md b/Readme.md index ed94fdf7..338542e2 100644 --- a/Readme.md +++ b/Readme.md @@ -64,6 +64,20 @@ let schema = Schema::new(&fields); let batch = RecordBatch::try_new(schema.into(), arrays)?; ``` +This `RecordBatch` can now be written to disk using [ArrowWriter] from the [parquet] crate. 
+ +[ArrowWriter]: https://docs.rs/parquet/latest/parquet/arrow/arrow_writer/struct.ArrowWriter.html +[parquet]: https://docs.rs/parquet/latest/parquet/ + + +```rust +let file = File::create("example.pq")?; +let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; +writer.write(&batch)?; +writer.close()?; +``` + + ### Serialize to `arrow2` arrays ```rust use serde_arrow::schema::{TracingOptions, SerdeArrowSchema}; @@ -87,27 +101,7 @@ let fields = let arrays = serde_arrow::to_arrow2(&fields, &records)?; ``` -These arrays can now be written to disk in formats such as Parquet using the -appropriate Arrow or Arrow2 APIs. - -### Write `arrow` `RecordBatch` to Parquet - -You can write the `RecordBatch` to a Parquet file using [ArrowWriter] from the -[parquet] crate: - -[ArrowWriter]: https://docs.rs/parquet/latest/parquet/arrow/arrow_writer/struct.ArrowWriter.html -[parquet]: https://docs.rs/parquet/latest/parquet/ - - -```rust -let file = File::create("example.pq"); -let mut writer = ArrowWriter::try_new(file, batch.schema(), None)?; -writer.write(&batch)?; -writer.close()?; -``` - -### Write `arrow2` arrays to Parquet -using the helper method defined in the +These arrays can now be written to disk using the helper method defined in the [arrow2 guide][arrow2-guide]. 
For parquet: ```rust,ignore @@ -121,12 +115,14 @@ write_chunk( )?; ``` +### Usage from python + The written file can now be read in Python via -### Polars ```python -import polars as pl -pl.read_parquet("example.pq") +# using polars +>>> import polars as pl +>>> pl.read_parquet("example.pq") shape: (3, 2) ┌─────┬─────┐ │ a ┆ b │ @@ -137,12 +133,10 @@ shape: (3, 2) │ 2.0 ┆ 2 │ │ 3.0 ┆ 3 │ └─────┴─────┘ -``` -### Pandas -```python -import pandas as pd -pd.read_parquet("example.pq") +# using pandas +>>> import pandas as pd +>>> pd.read_parquet("example.pq") a b 0 1.0 1 1 2.0 2 From 55747a483ce4d4d284bdeeab0fe84f6f424030c4 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Mon, 5 Feb 2024 19:56:44 -0500 Subject: [PATCH 4/4] Improve punctuation --- serde_arrow/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serde_arrow/src/lib.rs b/serde_arrow/src/lib.rs index 75b336c8..4ebd955f 100644 --- a/serde_arrow/src/lib.rs +++ b/serde_arrow/src/lib.rs @@ -82,7 +82,7 @@ //! ``` //! //! The `RecordBatch` can then be written to disk, e.g., as parquet using -//! the [`ArrowWriter`] from the [`parquet`] crate: +//! the [`ArrowWriter`] from the [`parquet`] crate. //! //! [`ArrowWriter`]: https://docs.rs/parquet/latest/parquet/arrow/arrow_writer/struct.ArrowWriter.html //! [`parquet`]: https://docs.rs/parquet/latest/parquet/