-
Notifications
You must be signed in to change notification settings - Fork 831
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Raw JSON writer (~10x faster) (#5314) #5318
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,28 +20,6 @@ | |
//! This JSON writer converts Arrow [`RecordBatch`]es into arrays of | ||
//! JSON objects or JSON formatted byte streams. | ||
//! | ||
//! ## Writing JSON Objects | ||
//! | ||
//! To serialize [`RecordBatch`]es into array of | ||
//! [JSON](https://docs.serde.rs/serde_json/) objects, use | ||
//! [`record_batches_to_json_rows`]: | ||
//! | ||
//! ``` | ||
//! # use std::sync::Arc; | ||
//! # use arrow_array::{Int32Array, RecordBatch}; | ||
//! # use arrow_schema::{DataType, Field, Schema}; | ||
//! | ||
//! let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]); | ||
//! let a = Int32Array::from(vec![1, 2, 3]); | ||
//! let batch = RecordBatch::try_new(Arc::new(schema), vec![Arc::new(a)]).unwrap(); | ||
//! | ||
//! let json_rows = arrow_json::writer::record_batches_to_json_rows(&[&batch]).unwrap(); | ||
//! assert_eq!( | ||
//! serde_json::Value::Object(json_rows[1].clone()), | ||
//! serde_json::json!({"a": 2}), | ||
//! ); | ||
//! ``` | ||
//! | ||
//! ## Writing JSON formatted byte streams | ||
//! | ||
//! To serialize [`RecordBatch`]es into line-delimited JSON bytes, use | ||
|
@@ -97,6 +75,8 @@ | |
//! In order to explicitly write null values for keys, configure a custom [`Writer`] by | ||
//! using a [`WriterBuilder`] to construct a [`Writer`]. | ||
|
||
mod encoder; | ||
|
||
use std::iter; | ||
use std::{fmt::Debug, io::Write}; | ||
|
||
|
@@ -109,7 +89,9 @@ use arrow_array::types::*; | |
use arrow_array::*; | ||
use arrow_schema::*; | ||
|
||
use crate::writer::encoder::EncoderOptions; | ||
use arrow_cast::display::{ArrayFormatter, FormatOptions}; | ||
use encoder::make_encoder; | ||
|
||
fn primitive_array_to_json<T>(array: &dyn Array) -> Result<Vec<Value>, ArrowError> | ||
where | ||
|
@@ -481,6 +463,7 @@ fn set_column_for_json_rows( | |
|
||
/// Converts an arrow [`RecordBatch`] into a `Vec` of Serde JSON | ||
/// [`JsonMap`]s (objects) | ||
#[deprecated(note = "Use Writer")] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can't figure out if the deprecation is needed for the new json writer, or did you just include it in the same PR for convenience? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I lumped the deprecation into this PR as moving the writer over to mainly not use this functionality means a reduction in our test coverage of it |
||
pub fn record_batches_to_json_rows( | ||
batches: &[&RecordBatch], | ||
) -> Result<Vec<JsonMap<String, Value>>, ArrowError> { | ||
|
@@ -597,11 +580,7 @@ pub type ArrayWriter<W> = Writer<W, JsonArray>; | |
|
||
/// JSON writer builder. | ||
#[derive(Debug, Clone, Default)] | ||
pub struct WriterBuilder { | ||
/// Controls whether null values should be written explicitly for keys | ||
/// in objects, or whether the key should be omitted entirely. | ||
explicit_nulls: bool, | ||
} | ||
pub struct WriterBuilder(EncoderOptions); | ||
|
||
impl WriterBuilder { | ||
/// Create a new builder for configuring JSON writing options. | ||
|
@@ -629,7 +608,7 @@ impl WriterBuilder { | |
|
||
/// Returns `true` if this writer is configured to keep keys with null values. | ||
pub fn explicit_nulls(&self) -> bool { | ||
self.explicit_nulls | ||
self.0.explicit_nulls | ||
} | ||
|
||
/// Set whether to keep keys with null values, or to omit writing them. | ||
|
@@ -654,7 +633,7 @@ impl WriterBuilder { | |
/// | ||
/// Default is to skip nulls (set to `false`). | ||
pub fn with_explicit_nulls(mut self, explicit_nulls: bool) -> Self { | ||
self.explicit_nulls = explicit_nulls; | ||
self.0.explicit_nulls = explicit_nulls; | ||
self | ||
} | ||
|
||
|
@@ -669,7 +648,7 @@ impl WriterBuilder { | |
started: false, | ||
finished: false, | ||
format: F::default(), | ||
explicit_nulls: self.explicit_nulls, | ||
options: self.0, | ||
} | ||
} | ||
} | ||
|
@@ -702,8 +681,8 @@ where | |
/// Determines how the byte stream is formatted | ||
format: F, | ||
|
||
/// Whether keys with null values should be written or skipped | ||
explicit_nulls: bool, | ||
/// Controls how JSON should be encoded, e.g. whether to write explicit nulls or skip them | ||
options: EncoderOptions, | ||
} | ||
|
||
impl<W, F> Writer<W, F> | ||
|
@@ -718,11 +697,12 @@ where | |
started: false, | ||
finished: false, | ||
format: F::default(), | ||
explicit_nulls: false, | ||
options: EncoderOptions::default(), | ||
} | ||
} | ||
|
||
/// Write a single JSON row to the output writer | ||
#[deprecated(note = "Use Writer::write")] | ||
pub fn write_row(&mut self, row: &Value) -> Result<(), ArrowError> { | ||
let is_first_row = !self.started; | ||
if !self.started { | ||
|
@@ -738,18 +718,48 @@ where | |
Ok(()) | ||
} | ||
|
||
/// Convert the `RecordBatch` into JSON rows, and write them to the output | ||
/// Serialize `batch` to JSON output | ||
pub fn write(&mut self, batch: &RecordBatch) -> Result<(), ArrowError> { | ||
for row in record_batches_to_json_rows_internal(&[batch], self.explicit_nulls)? { | ||
self.write_row(&Value::Object(row))?; | ||
if batch.num_rows() == 0 { | ||
return Ok(()); | ||
} | ||
|
||
// BufWriter uses a buffer size of 8KB | ||
// We therefore double this and flush once we have more than 8KB | ||
let mut buffer = Vec::with_capacity(16 * 1024); | ||
|
||
let mut is_first_row = !self.started; | ||
if !self.started { | ||
self.format.start_stream(&mut buffer)?; | ||
self.started = true; | ||
} | ||
|
||
let array = StructArray::from(batch.clone()); | ||
let mut encoder = make_encoder(&array, &self.options)?; | ||
|
||
for idx in 0..batch.num_rows() { | ||
self.format.start_row(&mut buffer, is_first_row)?; | ||
is_first_row = false; | ||
|
||
encoder.encode(idx, &mut buffer); | ||
if buffer.len() > 8 * 1024 { | ||
self.writer.write_all(&buffer)?; | ||
buffer.clear(); | ||
} | ||
self.format.end_row(&mut buffer)?; | ||
} | ||
|
||
if !buffer.is_empty() { | ||
self.writer.write_all(&buffer)?; | ||
} | ||
|
||
Ok(()) | ||
} | ||
|
||
/// Convert the [`RecordBatch`] into JSON rows, and write them to the output | ||
/// Serialize `batches` to JSON output | ||
pub fn write_batches(&mut self, batches: &[&RecordBatch]) -> Result<(), ArrowError> { | ||
for row in record_batches_to_json_rows_internal(batches, self.explicit_nulls)? { | ||
self.write_row(&Value::Object(row))?; | ||
for b in batches { | ||
self.write(b)?; | ||
} | ||
Ok(()) | ||
} | ||
|
@@ -1453,6 +1463,7 @@ mod tests { | |
} | ||
|
||
#[test] | ||
#[allow(deprecated)] | ||
fn json_writer_one_row() { | ||
let mut writer = ArrayWriter::new(vec![] as Vec<u8>); | ||
let v = json!({ "an": "object" }); | ||
|
@@ -1465,6 +1476,7 @@ mod tests { | |
} | ||
|
||
#[test] | ||
#[allow(deprecated)] | ||
fn json_writer_two_rows() { | ||
let mut writer = ArrayWriter::new(vec![] as Vec<u8>); | ||
let v = json!({ "an": "object" }); | ||
|
@@ -1564,9 +1576,9 @@ mod tests { | |
r#"{"a":{"list":[1,2]},"b":{"list":[1,2]}} | ||
{"a":{"list":[null]},"b":{"list":[null]}} | ||
{"a":{"list":[]},"b":{"list":[]}} | ||
{"a":null,"b":{"list":[3,null]}} | ||
{"b":{"list":[3,null]}} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The prior behaviour feels like a bug to me, without explicit nulls set I would expect consistent use of implicit nulls. The fact that null objects happen to be treated differently to null primitives seems at best confusing. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This does seem like it was a bug previously, I'm just racking my brain to remember if I was aware of this before or not, if there was a reason for this 🤔 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think when I worked on #5133 I just forgot to consider my previous work for writing explicit nulls in #5065. This fix makes sense; the only case where we should write nulls if |
||
{"a":{"list":[4,5]},"b":{"list":[4,5]}} | ||
{"a":null,"b":{}} | ||
{"b":{}} | ||
{"a":{},"b":{}} | ||
"#, | ||
); | ||
|
@@ -1621,7 +1633,7 @@ mod tests { | |
assert_json_eq( | ||
&buf, | ||
r#"{"map":{"foo":10}} | ||
{"map":null} | ||
{} | ||
{"map":{}} | ||
{"map":{"bar":20,"baz":30,"qux":40}} | ||
{"map":{"quux":50}} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This functionality isn't removed, yet, but it is deprecated as I can't think of any reasonable use-cases for this. If you're wanting to embed arrow data in another JSON document, serde_json's raw value mechanism is an objectively better way to go about doing this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks like @houqp added it in d868cff many 🌔 's ago - perhaps he has some additional context.
I agree I can't really think of why this would be useful - it seems like it may be similar to wanting to convert RecordBatches into actual Rust
struct
s via serde but I can't remember how far we got with thatGiven I am not familiar with
serde_json's raw value mechanism
I suspect others may not be eitherPerhaps you can add a note here about writing JSON objects using serde and leave a link for readers to follow
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hey @tustvold, could you clarify on what the serde_json's raw value mechanism is you're thinking of?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
https://docs.rs/serde_json/latest/serde_json/value/struct.RawValue.html
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah, this isn't clear to me either as I mentioned in the original review -- I made a PR to add an example showing how to use this: #5364